R/hsp_mk_model.R
In castor: Efficient Phylogenetics on Large Trees

Documented in hsp_mk_model

# Hidden state prediction (HSP) for discrete characters using a fixed-rates continuous-time Markov model (aka. "Mk model")
# Allows missing (unknown) states for some tips
# The transition matrix can either be provided, or can be estimated via maximum-likelihood fitting.
# Returns the loglikelihood of the model, the transition matrix and (optionally) the ancestral state likelihoods ("marginal ancestral_likelihoods") for all internal nodes and tips of the tree.
# Uses the rerooting method introduced by Yang et al (1995)
hsp_mk_model = function(tree, 
						tip_states,									# 1D numerical/character/factor array of size Ntips. Can also be NULL.
						Nstates 				= NULL,				# number of possible states. Can be NULL.
						reveal_fractions		= NULL,				# optional numerical vector of size Nstates, indicating the reveal fractions (fraction of tips included in tree) conditional upon each tip state. This can be used to incorporate detection biases for species, depending on their state. Can also be NULL or a single number (in which case reveal fractions are assumed to be independent of tip state).
						tip_priors 				= NULL, 			# 2D numerical array of size Ntips x Nstates, listing the prior likelihood of each state at each tip. Can be provided alternatively to tip_states.
						rate_model 				= "ER",				# either "ER" or "SYM" or "ARD" or "SUEDE" or "SRD" or an integer vector mapping entries of the transition matrix to a set of independent rate parameters. The format and interpretation is the same as for index.matrix generated by the function get_transition_index_matrix(..).
						transition_matrix		= NULL,				# either NULL, or a transition matrix of size Nstates x Nstates, such that transition_matrix * p gives the rate of change of probability vector p. If NULL, the transition matrix will be fitted via maximum-likelihood. The convention is that [i,j] gives the transition rate i-->j.
						include_likelihoods 	= TRUE,				# (bool) include the marginal ancestral state likelihoods for all nodes and tips in the returned values
						root_prior 				= "empirical",		# can be 'auto', 'flat', 'stationary', 'empirical', 'likelihoods' or 'max_likelihood' or a numeric vector of size Nstates
						Ntrials 				= 1,				# (int) number of trials (starting points) for fitting the transition matrix. Only relevant if transition_matrix=NULL.
						optim_algorithm 		= "nlminb",			# either "optim" or "nlminb". What algorithm to use for fitting.
						optim_max_iterations	= 200,				# maximum number of iterations of the optimization algorithm (per trial)
						optim_rel_tol			= 1e-8,				# relative tolerance when optimizing the objective function
						store_exponentials 		= TRUE,
						check_input 			= TRUE,				# (bool) perform some basic sanity checks on the input data. Set this to FALSE if you're certain your input data is valid.
						Nthreads 				= 1){				# (integer) number of threads for running multiple fitting trials in parallel
    Ntips	= length(tree$tip.label)
    Nnodes 	= tree$Nnode
    Nedges 	= nrow(tree$edge)
	
	# basic error checking
	if((!is.null(tip_states)) && (!is.null(tip_priors))) stop("ERROR: tip_states and tip_priors are both non-NULL, but exactly one of them should be NULL")
	else if(is.null(tip_states) && is.null(tip_priors))  stop("ERROR: tip_states and tip_priors are both NULL, but exactly one of them should be non-NULL")
	if((!is.null(tip_states)) && (!is.numeric(tip_states))) stop(sprintf("ERROR: tip_states must be integers"))
	if(check_input){
		if((!is.null(names(tip_states))) && any(names(tip_states)!=tree$tip.label)) stop("ERROR: Names in tip_states and tip labels in tree don't match (must be in the same order).")
	}
	if(is.null(Nstates)) Nstates = (if(is.null(tip_states)) ncol(tip_priors) else max(tip_states))
	has_reveal_fractions  = ((!is.null(reveal_fractions)) && (length(reveal_fractions)>1) && (!all(reveal_fractions<=0)) && (length(unique(reveal_fractions))>1))
	if(has_reveal_fractions && (any(reveal_fractions>1) || any(reveal_fractions<0))) stop(sprintf("ERROR: Reveal fractions must be positive numerics between 0 and 1 (found values from %g to %g)",min(reveal_fractions),max(reveal_fractions)))
	
	# find known_tips
	if(!is.null(tip_states)){
		known_tips = which(!is.na(tip_states));
	}else{
		known_tips = which(rowSums(is.na(tip_priors))==0);
	}
	if(length(known_tips)==0) return(list(success=FALSE, error="All tip states are hidden"))
	else if(length(known_tips)==1) return(list(success=FALSE, error="Only one tip has known state, which is insufficient for Mk HSP"))
	unknown_tips = get_complement(Ntips, known_tips)
	
	if(has_reveal_fractions){
		# reveal fractions are non-NULL and non-flat, i.e. tip state knowledge is correlated with tip state
		# reveal fractions (or "reveal bias") can be incorporated into the priors of tips with known & unknown state
		Nknown_tips_per_state = sapply(1:Nstates, function(state) sum(tip_states[known_tips]==state));
		if(any((Nknown_tips_per_state>0) & (reveal_fractions==0))) stop("ERROR: Sampling fractions for some states are zero, despite the presence of tips with that known state")
		reveal_fractions = reveal_fractions*Ntips/sum(sapply(1:Nstates,function(state) (if(Nknown_tips_per_state[state]==0) 0 else Nknown_tips_per_state[state]/reveal_fractions[state]))) # rescale reveal fractions to be consistent with the number of known tips in each state within the considered tree
		if(!is.null(tip_states)){
			tip_priors = matrix(0, nrow=Ntips, ncol=Nstates);
			tip_priors[cbind(known_tips, tip_states[known_tips])] = reveal_fractions[tip_states[known_tips]];
			tip_priors[unknown_tips,] = matrix(1-reveal_fractions,nrow=length(unknown_tips),ncol=Nstates,byrow=TRUE)
		}else{
			# use provided priors for known_tips as is, but adjust priors of unknown_tips
			tip_priors[unknown_tips,] = matrix(1-reveal_fractions,nrow=length(unknown_tips),ncol=Nstates,byrow=TRUE)			
		}
		# perform ancestral state reconstruction on full tree, using the adjusted priors
		asr_results = asr_mk_model(	tree					= tree, 
									tip_states		 		= NULL,
									Nstates 				= Nstates,
									tip_priors 				= tip_priors,
									rate_model 				= rate_model,
									transition_matrix 		= transition_matrix,
									include_ancestral_likelihoods = include_likelihoods,
									reroot 					= TRUE,
									root_prior 				= root_prior,
									Ntrials 				= Ntrials,
									optim_algorithm 		= optim_algorithm,
									optim_max_iterations	= optim_max_iterations,
									optim_rel_tol			= optim_rel_tol,
									store_exponentials 		= store_exponentials,
									check_input 			= check_input,
									Nthreads				= Nthreads)
		Nstates 			= asr_results$Nstates
		loglikelihood 		= asr_results$loglikelihood
		transition_matrix 	= asr_results$transition_matrix
		if((!asr_results$success) || is.null(loglikelihood) || is.nan(loglikelihood) || (include_likelihoods && is.null(asr_results$ancestral_likelihoods))){
			# ASR failed
			return(list(success=FALSE, Nstates=Nstates, loglikelihood=NULL, transition_matrix=NULL, likelihoods=NULL, error=sprintf("ASR failed: %s",asr_results$error)));
		}
		if(include_likelihoods){
			# forward-project posteriors to tips with hidden state
			likelihoods = matrix(0, nrow=(Ntips+Nnodes), ncol=Nstates);
			if(!is.null(tip_states)){
				likelihoods[cbind(known_tips, tip_states[known_tips])] = 1.0;
				likelihoods[unknown_tips, ] = tip_priors[unknown_tips,]
			}else{
				likelihoods[1:Ntips, ] = tip_priors;
			}
			likelihoods[(1:Nnodes)+Ntips, ]	= asr_results$ancestral_likelihoods;
			likelihoods_known				= rep(TRUE, times=(Ntips+Nnodes))
			likelihoods_known[unknown_tips]	= FALSE;
			likelihoods = apply_fixed_rate_Markov_model_to_missing_clades_CPP(	Ntips 					= Ntips,
																				Nnodes					= Nnodes,
																				Nedges					= Nedges,
																				Nstates					= Nstates,
																				tree_edge				= as.vector(t(tree$edge))-1,	# flatten in row-major format and make indices 0-based
																				edge_length 			= (if(is.null(tree$edge.length)) numeric() else tree$edge.length),
																				transition_matrix 		= as.vector(t(transition_matrix)), # flatten in row-major format
																				exponentiation_accuracy	= 1e-3,
																				max_polynomials			= 1000,
																				likelihoods_known		= likelihoods_known,
																				likelihoods				= as.vector(t(likelihoods)), # flatten in row-major format	
																				unknown_likelihoods_as_priors = TRUE);
			likelihoods = matrix(likelihoods, ncol=Nstates, byrow=TRUE); # unflatten returned table
			colnames(likelihoods) = colnames(asr_results$ancestral_likelihoods);
		}else{
			likelihoods = NULL
		}

	}else{
		# Reveal fractions are NULL or flat, i.e. tip state knowledge is independent of tip state
		# perform ASR on subtree with known tip states, then project to tips with hidden state
		# This essentially uses a flat prior for all tips with unknown state
		# If the subset of known tips is much smaller than the full tree, this approach is more efficient than (but equivalent to) Mk-fitting to the full tree with appropriate priors

		# extract known_subtree and synchronize known_tip_states with known_subtree
		extraction	 		= get_subtree_with_tips(tree, only_tips=known_tips, omit_tips=NULL, collapse_monofurcations=FALSE, force_keep_root=TRUE);
		known_subtree		= extraction$subtree;
		known2all_tips		= extraction$new2old_tip;
		known2all_nodes		= extraction$new2old_node;
		if(length(known_subtree$tip.label)==0) return(list(success=FALSE, error="ERROR: Subtree with known tip-states is empty"))
		if(!is.null(tip_states)){
			known_tip_states	= tip_states[known2all_tips]
			known_tip_priors	= NULL
		}else{
			known_tip_states	= NULL
			known_tip_priors	= tip_priors[known2all_tips,,drop=FALSE]
		}
	
		# perform ancestral state reconstruction on known_subtree
		asr_results = asr_mk_model(	tree					= known_subtree, 
									tip_states		 		= known_tip_states,
									Nstates 				= Nstates,
									tip_priors 				= known_tip_priors,
									rate_model 				= rate_model,
									transition_matrix 		= transition_matrix,
									include_ancestral_likelihoods = include_likelihoods,
									reroot 					= TRUE,
									root_prior 				= root_prior,
									Ntrials 				= Ntrials,
									optim_algorithm 		= optim_algorithm,
									optim_max_iterations	= optim_max_iterations,
									optim_rel_tol			= optim_rel_tol,
									store_exponentials 		= store_exponentials,
									check_input 			= check_input,
									Nthreads				= Nthreads);
		Nstates 			= asr_results$Nstates
		loglikelihood 		= asr_results$loglikelihood
		transition_matrix 	= asr_results$transition_matrix
		if((!asr_results$success) || is.null(loglikelihood) || is.nan(loglikelihood) || (include_likelihoods && is.null(asr_results$ancestral_likelihoods))){
			# ASR failed
			return(list(success=FALSE, Nstates=Nstates, loglikelihood=NULL, transition_matrix=NULL, likelihoods=NULL, error=sprintf("ASR failed: %s",asr_results$error)));
		}
		
		if(include_likelihoods){
			# forward-project posteriors to nodes and tips with hidden state
			likelihoods = matrix(0, nrow=(Ntips+Nnodes), ncol=Nstates);
			if(!is.null(tip_states)){
				likelihoods[cbind(known2all_tips, known_tip_states)] = 1.0;
			}else{
				likelihoods[known2all_tips, ] = known_tip_priors;
			}
			likelihoods[known2all_nodes+Ntips, ]		= asr_results$ancestral_likelihoods;
			likelihoods_known							= rep(FALSE, times=(Ntips+Nnodes))
			likelihoods_known[known2all_tips]			= TRUE;
			likelihoods_known[known2all_nodes+Ntips]  	= TRUE;
			likelihoods = apply_fixed_rate_Markov_model_to_missing_clades_CPP(	Ntips 					= Ntips,
																				Nnodes					= Nnodes,
																				Nedges					= Nedges,
																				Nstates					= Nstates,
																				tree_edge				= as.vector(t(tree$edge))-1,	# flatten in row-major format and make indices 0-based
																				edge_length 			= (if(is.null(tree$edge.length)) numeric() else tree$edge.length),
																				transition_matrix 		= as.vector(t(transition_matrix)), # flatten in row-major format
																				exponentiation_accuracy	= 1e-3,
																				max_polynomials			= 1000,
																				likelihoods_known		= likelihoods_known,
																				likelihoods				= as.vector(t(likelihoods)), # flatten in row-major format	
																				unknown_likelihoods_as_priors = FALSE);
			likelihoods = matrix(likelihoods, ncol=Nstates, byrow=TRUE); # unflatten returned table
			colnames(likelihoods) = colnames(asr_results$ancestral_likelihoods)
			states = sapply(seq_len(Ntips+Nnodes), FUN=function(n) which.max(likelihoods[n,]))
		}else{
			likelihoods = NULL
			states = NULL
		}
	}
		
	return(list(success				= TRUE,
				Nstates				= Nstates,
				transition_matrix	= transition_matrix, 
				loglikelihood 		= loglikelihood, 
				likelihoods			= likelihoods,
				states				= states)) # maximum-likelihood tip & node states
}