Nothing
# Hidden state prediction (HSP) for discrete characters using a fixed-rates continuous-time Markov model (aka. "Mk model")
# Allows missing (unknown) states for some tips
# The transition matrix can either be provided, or can be estimated via maximum-likelihood fitting.
# Returns the loglikelihood of the model, the transition matrix and (optionally) the ancestral state likelihoods ("marginal ancestral_likelihoods") for all internal nodes and tips of the tree.
# Uses the rerooting method introduced by Yang et al (1995)
hsp_mk_model = function(tree,
tip_states, # 1D numerical/character/factor array of size Ntips. Can also be NULL.
Nstates = NULL, # number of possible states. Can be NULL.
reveal_fractions = NULL, # optional numerical vector of size Nstates, indicating the reveal fractions (fraction of tips included in tree) conditional upon each tip state. This can be used to incorporate detection biases for species, depending on their state. Can also be NULL or a single number (in which case reveal fractions are assumed to be independent of tip state).
tip_priors = NULL, # 2D numerical array of size Ntips x Nstates, listing the prior likelihood of each state at each tip. Can be provided alternatively to tip_states.
rate_model = "ER", # either "ER" or "SYM" or "ARD" or "SUEDE" or "SRD" or an integer vector mapping entries of the transition matrix to a set of independent rate parameters. The format and interpretation is the same as for index.matrix generated by the function get_transition_index_matrix(..).
transition_matrix = NULL, # either NULL, or a transition matrix of size Nstates x Nstates, such that transition_matrix * p gives the rate of change of probability vector p. If NULL, the transition matrix will be fitted via maximum-likelihood. The convention is that [i,j] gives the transition rate i-->j.
include_likelihoods = TRUE, # (bool) include the marginal ancestral state likelihoods for all nodes and tips in the returned values
root_prior = "empirical", # can be 'auto', 'flat', 'stationary', 'empirical', 'likelihoods' or 'max_likelihood' or a numeric vector of size Nstates
Ntrials = 1, # (int) number of trials (starting points) for fitting the transition matrix. Only relevant if transition_matrix=NULL.
optim_algorithm = "nlminb", # either "optim" or "nlminb". What algorithm to use for fitting.
optim_max_iterations = 200, # maximum number of iterations of the optimization algorithm (per trial)
optim_rel_tol = 1e-8, # relative tolerance when optimizing the objective function
store_exponentials = TRUE,
check_input = TRUE, # (bool) perform some basic sanity checks on the input data. Set this to FALSE if you're certain your input data is valid.
Nthreads = 1){ # (integer) number of threads for running multiple fitting trials in parallel
Ntips = length(tree$tip.label);
Nnodes = tree$Nnode;
Nedges = nrow(tree$edge);
# basic error checking
if((!is.null(tip_states)) && (!is.null(tip_priors))) stop("ERROR: tip_states and tip_priors are both non-NULL, but exactly one of them should be NULL")
else if(is.null(tip_states) && is.null(tip_priors)) stop("ERROR: tip_states and tip_priors are both NULL, but exactly one of them should be non-NULL")
if((!is.null(tip_states)) && (!is.numeric(tip_states))) stop(sprintf("ERROR: tip_states must be integers"))
if(check_input){
if((!is.null(names(tip_states))) && any(names(tip_states)!=tree$tip.label)) stop("ERROR: Names in tip_states and tip labels in tree don't match (must be in the same order).")
}
if(is.null(Nstates)) Nstates = (if(is.null(tip_states)) ncol(tip_priors) else max(tip_states))
has_reveal_fractions = ((!is.null(reveal_fractions)) && (length(reveal_fractions)>1) && (!all(reveal_fractions<=0)) && (length(unique(reveal_fractions))>1))
if(has_reveal_fractions && (any(reveal_fractions>1) || any(reveal_fractions<0))) stop(sprintf("ERROR: Reveal fractions must be positive numerics between 0 and 1 (found values from %g to %g)",min(reveal_fractions),max(reveal_fractions)))
# find known_tips
if(!is.null(tip_states)){
known_tips = which(!is.na(tip_states));
}else{
known_tips = which(rowSums(is.na(tip_priors))==0);
}
if(length(known_tips)==0) stop("ERROR: All tip states are hidden");
unknown_tips = get_complement(Ntips, known_tips)
if(has_reveal_fractions){
# reveal fractions are non-NULL and non-flat, i.e. tip state knowledge is correlated with tip state
# reveal fractions (or "reveal bias") can be incorporated into the priors of tips with known & unknown state
Nknown_tips_per_state = sapply(1:Nstates, function(state) sum(tip_states[known_tips]==state));
if(any((Nknown_tips_per_state>0) & (reveal_fractions==0))) stop("ERROR: Sampling fractions for some states are zero, despite the presence of tips with that known state")
reveal_fractions = reveal_fractions*Ntips/sum(sapply(1:Nstates,function(state) (if(Nknown_tips_per_state[state]==0) 0 else Nknown_tips_per_state[state]/reveal_fractions[state]))) # rescale reveal fractions to be consistent with the number of known tips in each state within the considered tree
if(!is.null(tip_states)){
tip_priors = matrix(0, nrow=Ntips, ncol=Nstates);
tip_priors[cbind(known_tips, tip_states[known_tips])] = reveal_fractions[tip_states[known_tips]];
tip_priors[unknown_tips,] = matrix(1-reveal_fractions,nrow=length(unknown_tips),ncol=Nstates,byrow=TRUE)
}else{
# use provided priors for known_tips as is, but adjust priors of unknown_tips
tip_priors[unknown_tips,] = matrix(1-reveal_fractions,nrow=length(unknown_tips),ncol=Nstates,byrow=TRUE)
}
# perform ancestral state reconstruction on full tree, using the adjusted priors
asr_results = asr_mk_model( tree = tree,
tip_states = NULL,
Nstates = Nstates,
tip_priors = tip_priors,
rate_model = rate_model,
transition_matrix = transition_matrix,
include_ancestral_likelihoods = include_likelihoods,
reroot = TRUE,
root_prior = root_prior,
Ntrials = Ntrials,
optim_algorithm = optim_algorithm,
optim_max_iterations = optim_max_iterations,
optim_rel_tol = optim_rel_tol,
store_exponentials = store_exponentials,
check_input = check_input,
Nthreads = Nthreads);
Nstates = asr_results$Nstates
loglikelihood = asr_results$loglikelihood
transition_matrix = asr_results$transition_matrix
if((!asr_results$success) || is.null(loglikelihood) || is.nan(loglikelihood) || (include_likelihoods && is.null(asr_results$ancestral_likelihoods))){
# ASR failed
return(list(success=FALSE, Nstates=Nstates, loglikelihood=NULL, transition_matrix=NULL, likelihoods=NULL, error=sprintf("ASR failed: %s",asr_results$error)));
}
if(include_likelihoods){
# forward-project posteriors to tips with hidden state
likelihoods = matrix(0, nrow=(Ntips+Nnodes), ncol=Nstates);
if(!is.null(tip_states)){
likelihoods[cbind(known_tips, tip_states[known_tips])] = 1.0;
likelihoods[unknown_tips, ] = tip_priors[unknown_tips,]
}else{
likelihoods[1:Ntips, ] = tip_priors;
}
likelihoods[(1:Nnodes)+Ntips, ] = asr_results$ancestral_likelihoods;
likelihoods_known = rep(TRUE, times=(Ntips+Nnodes))
likelihoods_known[unknown_tips] = FALSE;
likelihoods = apply_fixed_rate_Markov_model_to_missing_clades_CPP( Ntips = Ntips,
Nnodes = Nnodes,
Nedges = Nedges,
Nstates = Nstates,
tree_edge = as.vector(t(tree$edge))-1, # flatten in row-major format and make indices 0-based
edge_length = (if(is.null(tree$edge.length)) numeric() else tree$edge.length),
transition_matrix = as.vector(t(transition_matrix)), # flatten in row-major format
exponentiation_accuracy = 1e-3,
max_polynomials = 1000,
likelihoods_known = likelihoods_known,
likelihoods = as.vector(t(likelihoods)), # flatten in row-major format
unknown_likelihoods_as_priors = TRUE);
likelihoods = matrix(likelihoods, ncol=Nstates, byrow=TRUE); # unflatten returned table
colnames(likelihoods) = colnames(asr_results$ancestral_likelihoods);
}else{
likelihoods = NULL
}
}else{
# Reveal fractions are NULL or flat, i.e. tip state knowledge is independent of tip state
# perform ASR on subtree with known tip states, then project to tips with hidden state
# This essentially uses a flat prior for all tips with unknown state
# If the subset of known tips is much smaller than the full tree, this approach is more efficient than (but equivalent to) Mk-fitting to the full tree with appropriate priors
# extract known_subtree and synchronize known_tip_states with known_subtree
extraction = get_subtree_with_tips(tree, only_tips=known_tips, omit_tips=NULL, collapse_monofurcations=FALSE, force_keep_root=TRUE);
known_subtree = extraction$subtree;
known2all_tips = extraction$new2old_tip;
known2all_nodes = extraction$new2old_node;
if(length(known_subtree$tip.label)==0) stop("ERROR: Subtree with known tip-states is empty")
if(!is.null(tip_states)){
known_tip_states = tip_states[known2all_tips]
known_tip_priors = NULL
}else{
known_tip_states = NULL
known_tip_priors = tip_priors[known2all_tips,,drop=FALSE]
}
# perform ancestral state reconstruction on known_subtree
asr_results = asr_mk_model( tree = known_subtree,
tip_states = known_tip_states,
Nstates = Nstates,
tip_priors = known_tip_priors,
rate_model = rate_model,
transition_matrix = transition_matrix,
include_ancestral_likelihoods = include_likelihoods,
reroot = TRUE,
root_prior = root_prior,
Ntrials = Ntrials,
optim_algorithm = optim_algorithm,
optim_max_iterations = optim_max_iterations,
optim_rel_tol = optim_rel_tol,
store_exponentials = store_exponentials,
check_input = check_input,
Nthreads = Nthreads);
Nstates = asr_results$Nstates
loglikelihood = asr_results$loglikelihood
transition_matrix = asr_results$transition_matrix
if((!asr_results$success) || is.null(loglikelihood) || is.nan(loglikelihood) || (include_likelihoods && is.null(asr_results$ancestral_likelihoods))){
# ASR failed
return(list(success=FALSE, Nstates=Nstates, loglikelihood=NULL, transition_matrix=NULL, likelihoods=NULL, error=sprintf("ASR failed: %s",asr_results$error)));
}
if(include_likelihoods){
# forward-project posteriors to nodes and tips with hidden state
likelihoods = matrix(0, nrow=(Ntips+Nnodes), ncol=Nstates);
if(!is.null(tip_states)){
likelihoods[cbind(known2all_tips, known_tip_states)] = 1.0;
}else{
likelihoods[known2all_tips, ] = known_tip_priors;
}
likelihoods[known2all_nodes+Ntips, ] = asr_results$ancestral_likelihoods;
likelihoods_known = rep(FALSE, times=(Ntips+Nnodes))
likelihoods_known[known2all_tips] = TRUE;
likelihoods_known[known2all_nodes+Ntips] = TRUE;
likelihoods = apply_fixed_rate_Markov_model_to_missing_clades_CPP( Ntips = Ntips,
Nnodes = Nnodes,
Nedges = Nedges,
Nstates = Nstates,
tree_edge = as.vector(t(tree$edge))-1, # flatten in row-major format and make indices 0-based
edge_length = (if(is.null(tree$edge.length)) numeric() else tree$edge.length),
transition_matrix = as.vector(t(transition_matrix)), # flatten in row-major format
exponentiation_accuracy = 1e-3,
max_polynomials = 1000,
likelihoods_known = likelihoods_known,
likelihoods = as.vector(t(likelihoods)), # flatten in row-major format
unknown_likelihoods_as_priors = FALSE);
likelihoods = matrix(likelihoods, ncol=Nstates, byrow=TRUE); # unflatten returned table
colnames(likelihoods) = colnames(asr_results$ancestral_likelihoods);
}else{
likelihoods = NULL
}
}
return(list(success = TRUE,
Nstates = Nstates,
transition_matrix = transition_matrix,
loglikelihood = loglikelihood,
likelihoods = likelihoods));
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.