R/asr_max_parsimony.R
In castor: Efficient Phylogenetics on Large Trees

Documented in asr_max_parsimony

# Maximum parsimony ansestral state reconstruction for discrete traits.
# Modification of Sankoff algorithm for reconstructing discrete ancestral states (Weighted Small Parsimony Problem)
# Sankoff's algorithm allows the inclusion of a cost matrix:
#  	transition_costs[i,j] is the cost of transitioning i-->j (ignoring edge length)
# 	If transition_costs is "all_equal", then all transitions are penalized equally (same as if transition_costs[i,j] = 1-delta_{ij})
# 	If transition_costs is "sequential", then only single-step transitions (i-->i+1) are allowed, and all are penalized equally
# 	If transition_costs is "proportional", then all transition are allowed but they are penalized proportionally to the number of steps.
#  The modification of this function is that optionally, edge lengths can be used to weight the transition costs:
#  	Longer edges imply smaller transition costs between states
#  	Specifically, the cost of transitioning is transition_cost[i,j]/(edge_length^edge_exponent)
# 	where edge_exponent can be e.g. 0 (Sankoff's original algorithm), 1 (linear weighting) or 0.5 (square-root weighting, corresponding to a Brownian motion)
#  Requirements:
# 	Tree can be multifurcating, and can also include nodes with a single child
#	Tree must be rooted.
# 	If (edge_exponent>0) then: All edges must have length > 0
#  For a description of the original Sankoff algorithm, see: 
# 	http://telliott99.blogspot.ca/2010/03/fitch-and-sankoff-algorithms-for.html
# 	(page 11) https://cs.brown.edu/courses/csci1950-z/slides/CSCI1950ZFall09_Lecture2.pdf
#  The function returns ancestral state probabilities as a (non-flattened) NumericMatrix of size Nnodes x Nstates.
asr_max_parsimony = function(	tree, 
								tip_states, 			# integer vector of size Ntips
								Nstates				= NULL, 
								transition_costs	= "all_equal", 
								edge_exponent		= 0,
								weight_by_scenarios	= TRUE,
								check_input			= TRUE){
	Ntips  = length(tree$tip.label)
	Nedges = nrow(tree$edge)

	# basic error checking
	if(length(tip_states)!=Ntips) stop(sprintf("ERROR: Length of tip_states (%d) is not the same as the number of tips in the tree (%d)",length(tip_states),Ntips));
	if(!is.numeric(tip_states)) stop(sprintf("ERROR: tip_states must be integers"))	
	if(is.null(Nstates)) Nstates = max(tip_states);
	if(check_input){
		min_tip_state = min(tip_states)
		max_tip_state = max(tip_states)
		if((min_tip_state<1) || (max_tip_state>Nstates)) stop(sprintf("ERROR: tip_states must be integers between 1 and %d, but found values between %d and %d",Nstates,min_tip_state,max_tip_state))
		if((!is.null(names(tip_states))) && any(names(tip_states)!=tree$tip.label)) stop("ERROR: Names in tip_states and tip labels in tree don't match (must be in the same order).")
	}
	
	# construct transition_costs matrix if needed
	if(is.character(transition_costs)){
		if(transition_costs=="all_equal"){
			# all transitions penalized equally
			transition_costs = matrix(1, nrow=Nstates, ncol=Nstates)
		}else if(transition_costs=="sequential"){
			# only single-step transitions are allowed, and all are penalized equally
			transition_costs = matrix(Inf, nrow=Nstates, ncol=Nstates)
			for(i in 1:Nstates){
				if(i<Nstates) transition_costs[i,i+1] = 1
				if(i>1) transition_costs[i,i-1] 	  = 1
			}
		}else if(transition_costs=="proportional"){
			# all transitions are allowed, but penalized proportional to the number of steps
			transition_costs = matrix(0, nrow=Nstates, ncol=Nstates)
			for(i in 1:Nstates){
				transition_costs[i,] = sapply(1:Nstates, function(j) abs(j-i))
			}
		}else if(transition_costs=="exponential"){
			# all transitions are allowed, but penalized exponentially to the number of steps
			transition_costs = matrix(0, nrow=Nstates, ncol=Nstates)
			for(i in 1:Nstates){
				transition_costs[i,] = sapply(1:Nstates, function(j) exp(abs(j-i)))
			}
		}else{
			stop(sprintf("ERROR: Uknown transition_costs '%s'",transition_costs));
		}
		diag(transition_costs) = 0; # no cost for staying in the same state
	}else{
		if(nrow(transition_costs)!=Nstates || ncol(transition_costs)!=Nstates) stop(sprintf("ERROR: Transition costs has wrong size (%d x %d), expected %d x %d",nrow(transition_costs),ncol(transition_costs),Nstates,Nstates));
		if(check_input){
			if(any(transition_costs<0)) stop(sprintf("ERROR: Some transition costs are negative (found value %g)",min(transition_costs)))
			if(any(diag(transition_costs)!=0)) stop(sprintf("ERROR: The diagonal of transition_costs includes non-zero values, which makes no sense in a maximum-parsimony model"))
			if((edge_exponent!=0) && (!is.null(tree$edge.length)) && (any(tree$edge.length==0))) stop(sprintf("ERROR: edge_exponent is non-zero, but some edges in the tree have zero length"))
		}
	}

	results = WMPR_ASR_CPP(	Ntips			 						= Ntips,
							Nnodes			 						= tree$Nnode,
							Nedges			 						= Nedges,
							Nstates			 						= Nstates,
							tree_edge 		 						= as.vector(t(tree$edge)) - 1, # flatten in row-major format and adjust clade indices to 0-based
							edge_length		 						= (if(is.null(tree$edge.length)) numeric() else tree$edge.length),
							tip_states		 						= tip_states-1,
							transition_costs 						= as.vector(t(transition_costs)),
							branch_length_exponent 					= edge_exponent,
							weight_posteriors_by_scenario_counts	= weight_by_scenarios,	# (INPUT) if true, then the posterior_probability of a state (in a specific node) is proportional to the number of scenarios in which the trait is at that state
							verbose									= FALSE,
							verbose_prefix							= "");
	return(list(success 				= TRUE,
				ancestral_likelihoods 	= results$posterior_probabilities,
				scenario_counts			= matrix(results$scenario_counts, ncol=Nstates, byrow=TRUE),
				ancestral_states		= sapply(seq_len(tree$Nnode), FUN=function(node) which.max(results$posterior_probabilities[node,])), # maximum-likelihood ancestral states
				total_cost 				= results$best_root_cost));
}