# Input
# tree a phylogenetic tree as an object of class "phylo" (from package
# ape)
sfreemapr.map <- function(tree, tip_states, Q, ...) {
# Should this program run in parallel?
parallel <- TRUE
if (hasArg(parallel)) {
parallel <- list(...)$parallel
}
# tree sanity check
if (inherits(tree, "multiPhylo")) {
# For Just call the same program multiple times...
if (parallel == TRUE) {
cores <- detectCores()
mtrees <- mclapply(tree, sfreemapr.map, tip_states, Q, ..., mc.cores=cores)
} else {
mtrees <- lapply(tree, sfreemapr.map, tip_states, Q, ...)
}
# When Q=mcmc we will have length(trees)*n_simulation trees at the end
# of the execution. Instead of lists of multPhylo objects we want to
# return one multiPhylo object with all trees.
if (Q == 'mcmc') {
mtrees <- c(mapply(c, mtrees))
}
class(mtrees) <- "multiPhylo"
return(mtrees)
} else if (!inherits(tree, "phylo")) {
stop("'tree' should be an object of class 'phylo'")
} else if (!is.rooted(tree)) {
stop("'tree' must be rooted")
}
# Defining the prior distribution for the root node of the tree,
# also known as "pi"
prior <- "equal"
if (hasArg(prior)) {
prior <- list(...)$prior
}
# tol gives the tolerance for zero elements in Q.
# (Elements less then tol will be reset to tol)
tol <- 1e-8
if (hasArg(tol)) {
tol <- list(...)$tol
}
# prior a list containing alpha and beta parameters for the gamma
# prior distribution on the transition rates in Q. Note that alpha
# and beta can be single values or vectors, if different prior
# are desired for each value in Q
gamma_prior <- list(alpha=1, beta=1, use.empirical=FALSE)
if (hasArg(gamma_prior)) {
pr <- list(...)$gamma_prior
gamma_prior[names(pr)] <- pr
}
# burn_in for the MCMC
burn_in <- 1000
if (hasArg(burn_in)) {
burn_in <- list(...)$burn_in
}
# sample_freq for the MCMC
sample_freq <- 100
if (hasArg(sample_freq)) {
sample_freq <- list(...)$sample_freq
}
# number os simulations for the MCMC
n_simulations <- 100
if (hasArg(n_simulations)) {
n_simulations <- list(...)$n_simulations
}
# For now only "symmetrical" model is accepted
model <- "SYM"
#if (hasArg(model)) {
# model <- list(...)$model
#}
# A single numeric value or a vector containing the (normal)
# sampling variances for the MCMC
vQ <- 0.1
if (hasArg(vQ)) {
vQ <- list(...)$vQ
}
# Define the tip states as a matrix
if (!is.matrix(tip_states)) {
tip_states <- build_states_matrix(tree, tip_states)
}
# Defining Q
if (is.character(Q) && (Q == "empirical")) {
# Phytools would replicate this result nsim times, but for now
# we will return just one result.
QP <- Q_empirical(tree, tip_states, prior, model, tol)
} else if (is.character(Q) && (Q == "mcmc")) {
# TODO: This function will generate many Qs. We have to decide how to
# deal with it. Maybe just run the program for every Q?
QP <- Q_mcmc(tree, tip_states, model, prior, gamma_prior, tol, burn_in
, sample_freq, vQ, n_simulations)
# Call sfreemapr.map for each {Q,prior} returned by the mcmc simulation
params <- list(...)
params$tree <- tree
params$tip_states <- tip_states
res <- function(QP) {
params$Q <- QP$Q
params$prior <- QP$prior
return (do.call(sfreemapr.map, params))
}
if (parallel == TRUE) {
mtrees <- mclapply(QP, res, mc.cores=detectCores())
} else {
mtrees <- lapply(QP, res)
}
class(mtrees) <- "multiPhylo"
return(mtrees)
} else if (is.matrix(Q)) {
# Phytools would replicate this result nsim times, but for now
# we will return just one result.
QP <- Q_matrix(tree, tip_states, Q, model, prior, tol)
} else {
stop("Unrecognized format for 'Q'")
}
# Set the final value
Q <- QP$Q
prior <- QP$prior
logL <- QP$logL
# Vector with rewards
rewards <- rep(1,nrow(Q))
if (hasArg(rewards)) {
rewards <- list(...)$rewards
if (length(rewards) != nrow(Q)) {
stop("The rewards vector should represent the states")
}
}
names(rewards) <- colnames(Q)
# Defining the transitions of interest. By default, all transitions.
QL <- matrix(1, nrow=nrow(Q), ncol=ncol(Q))
diag(QL) <- 0
if (hasArg(QL)) {
QL <- list(...)$QL
if (!all(dim(Q) == dim(QL))) {
stop("QL must have same dimensions as Q")
}
}
rownames(QL) <- colnames(QL) <- rownames(Q)
# Acquire more info about the tree.
tree_extra <- list(
states = tip_states
, n_states = nrow(Q)
, n_edges = length(tree$edge.length)
, n_tips = nrow(tip_states)
, n_nodes = nrow(tip_states) + tree$Nnode
, rewards = rewards
)
# Reorder the tree so the root is the first row of the matrix.
# We save the original order to make sure we have the result
# in same order of the tree;
tree <- reorder(tree, 'pruningwise')
# Step 1
# Compute Eigen values and Eigen vectors for the transition rate matrix
# Q, assuming Q is symmetrical
Q_eigen <- eigen(Q, TRUE, only.values = FALSE)
# The inverse of Q_eigen vectors
Q_eigen[['vectors_inv']] <- solve(Q_eigen$vectors)
# Step 2
# Compute P(tp), the transistion probability, for each edge length t of T
# Q = U X diag(d1,...,dm) X U**-1
# U are the eigenvectors of Q
# d1,...,dm are the eigenvalues of Q
# diag(d1,...,dm) is a diagonal matrix with d1,...,dm on it's main
# diagonal
# For now I'm doing this just where it is needed, in the
# fractional_likelihood calculation. I don't know yet if I'm going
# to need it somewhere else. It might be useful to compute it here
# and use it in all different places if that's the case.
MAP <- list()
MAP[['Q']] <- Q
MAP[['prior']] <- prior
# Transistion probabilities
MAP[['tp']] <- transition_probabilities(Q_eigen, tree$edge.length)
# Step 4 and 5
# Traverse the tree once and calculate Fu and Sb for each node u and
# each edge b;
# Compute the data likelihood Pr(D) as the dot product of Froot and root
# distribution pi.
MAP[['fl']] <- fractional_likelihoods(tree, tree_extra, Q, Q_eigen
, prior, MAP$tp, tol)
# FIXME: for some unknown reason if I remove this line I get a 'out of
# bounds' error in posterior_restricted_moment(). This line doesn't need to
# exist
MAP[['h']] <- list()
# Build dwelling times
tree[['mapped.edge']] <- tree[['mapped.edge.lmt']] <- tname <- NULL
states <- rownames(QL)
n_states <- tree_extra$n_states
multiplier <- matrix(0, nrow=n_states, ncol=n_states)
for (i in 1:n_states) {
for (j in 1:n_states) {
if (i == j) {
value <- rewards[i] # dwelling times
} else {
value <- QL[i,j] * Q[i,j] # number of transitions
}
if (value == 0) {
next
}
multiplier[i,j] <- value
# Step 3
h <- func_H(multiplier, Q_eigen, tree, tree_extra, tol)
prm <- posterior_restricted_moment(h, tree, tree_extra, MAP)
ev <- expected_value(tree, Q, MAP, prm)
if (i == j) {
# dwelling times
tree[['mapped.edge']] <- cbind(tree[['mapped.edge']], ev)
} else {
# number of transitions
state_from <- states[i]
state_to <- states[j]
tname <- c(tname, paste(state_from, state_to, sep=','))
tree[['mapped.edge.lmt']] <- cbind(tree[['mapped.edge.lmt']], ev)
}
multiplier[i,j] <- 0
}
}
colnames(tree[['mapped.edge']]) <- names(rewards)
colnames(tree[['mapped.edge.lmt']]) <- tname
# Return the tree in the original order
return (sfreemapr.reorder(tree, 'cladewise'))
}
# The final answer!
expected_value <- function(tree, Q, map, prm) {
likelihood <- map[['fl']][['L']]
ret <- prm / likelihood
# the rownames of the mapped objects
names(ret) <- paste(tree$edge[,1], ",", tree$edge[,2], sep="")
return (ret)
}
# Compute the expected value for labelled markov transitions and
# evolutionary reward, given the fractional and directional likelihoods
posterior_restricted_moment <- function(multiplier, tree, tree_extra, map) {
# Allocate vector of size tree_extra$n_edges. Index i will hold the expected value
# of the ith edge of the tree
ret <- rep(0, tree_extra$n_edges)
fl <- map[['fl']] # fractional likelihoods
F <- fl[['F']]
G <- fl[['G']]
S <- fl[['S']]
# These tree lines below are responsible to generate a matrix similar to
# tree$edge, but with an additional columns representing the 'brother'
# of the node.
get_brother <- function(x) return(tree$edge[c(x+1,x),2])
in_pairs <- seq(1, nrow(tree$edge), 2)
edge <- cbind(tree$edge, sapply(sapply(in_pairs, get_brother), rbind))
for (i in 1:tree_extra$n_edges) {
p <- edge[i,1] # parent node
c <- edge[i,2] # child node
b <- edge[i,3] # brother node
partial <- multiplier[,,i]
# Equation 3.8 in the paper
for (j in 1:tree_extra$n_states) {
gs <- G[p,j] * S[b,j] * F[c,]
ret[i] <- ret[i] + sum(gs * partial[j,])
}
}
return (ret)
}
# The vector of forward, often called partial or fractional likelihood.
fractional_likelihoods <- function(tree, tree_extra, Q, Q_eigen, prior, Tp, tol) {
# F is the vector of forward, often called partial or fractional,
# likelihoods at node u. Element F ui is the probability of the
# observed data at only the tips that descend from the node u,
# given that the state of u is i.
F <- matrix(0, tree_extra$n_nodes, tree_extra$n_states)
# Directional likelihoods
S <- matrix(0, tree_extra$n_nodes, tree_extra$n_states)
# G is the probability of observing state i at node
# u together with other tip states on the subtree of t
# obtained by removing all lineages downstream of node u.
G <- matrix(0, tree_extra$n_nodes, tree_extra$n_states)
# Set col names, just to make it easy to debug
states <- colnames(F) <- colnames(S) <- colnames(G) <- colnames(Q)
# Init F matrix with tip values
# As stated in the article, when value is ambiguous, we set all ambiguous
# values to 1 (100% chance)
F[1:tree_extra$n_tips,] <- tree_extra$states
# Compute F
for (e in seq(1, tree_extra$n_edges, 2)) {
u <- tree$edge[e,1] # parent node
right <- tree$edge[e,2] # right node
left <- tree$edge[e+1,2] # left node
# NOTE: this loop can possibly be vectorized...
tright <- Tp[,,e] # transistion probability for edge p -> right
tleft <- Tp[,,e+1] # transistion probability for edge p -> left
for (i in 1:tree_extra$n_states) {
S[right,i] <- sum(F[right,] * tright[i,])
S[left,i] <- sum(F[left,] * tleft[i,])
}
# When values are smaller than tol set them to tol, otherwise the
# likelihood will be zero, we will have division by zero and the world
# will end
F[u,] <- S[right,] * S[left,]
}
# Get the root node number
# Remember that the tree is ordered pruninwise
root <- tail(tree$edge, n=1)[1,1]
# Compute the likelihood of the tree
L <- sum(F[root,]*prior)
# Compute G using F and S
# Article: "Computational Advances in Maximum Likelihood Methods for
# Molecular Phylogeny", authors Eric E. Schadt, Janet S. Sinsheimer and
# Kenneth Lange
G[root,] <- prior
for (e in seq(tree_extra$n_edges, 1, -2)) {
p <- tree$edge[e,1] # parent node
left <- tree$edge[e,2] # left node
right <- tree$edge[e-1,2] # right node
# TODO: Vectorize this...
tleft <- Tp[,,e]
tright <- Tp[,,e-1]
for (i in 1:tree_extra$n_states) {
G[left,i] <- sum(G[p,]*S[right,]*tleft[i,])
G[right,i] <- sum(G[p,]*S[left,]*tright[i,])
}
}
return (list(F=F, G=G, S=S, L=L))
}
# Input
# Q_eigen the eigen deposition of Q
# t a edge length
#
# output A tridimentional matrix with the transition probabilities P
# for every the edge length t of the tree.
transition_probabilities <- function(Q_eigen, edges) {
# P(t) = U X diag(e**d1t, ... e**dmt) X U**-1
# t is an arbitrary edge length
# exp(n) is nth power of e (euler number)
n_edges <- length(edges)
P <- array(0, dim = c(dim(Q_eigen$vectors), n_edges))
# Initialize d
d <- diag(Q_eigen$values)
# Iterate over branch length
# NOTE: vectorize this...
for (i in 1:n_edges) {
diag(d) <- exp(Q_eigen$values * edges[i])
P[,,i] <- Q_eigen$vectors %*% d %*% Q_eigen$vectors_inv
}
return(P)
}
# The expected number of labelled markov transitions and expected markov rewards
# TODO: find a better name for this function =P
func_H <- function(multiplier, Q_eigen, tree, tree_extra, tol) {
ret <- array(0, dim = c(tree_extra$n_states, tree_extra$n_states, tree_extra$n_edges))
# Just an alias, to make it easier to read the formulas below
d <- Q_eigen$values
# 1. Get the transitions that happened in a edge
# TODO: This matrix has only one entry different from zero, there
# might be a faster way to calculate Si and Sj without multiplying
# the entire matrix
gen_Si <- function(i) {
E <- matrix(0, tree_extra$n_states, tree_extra$n_states)
E[i,i] <- 1
return (Q_eigen$vectors %*% E %*% Q_eigen$vectors_inv)
}
# This is part of the second inner loop. We broght it outside
# because... it's R, and you have to do weird stuff to improve performance.
Sij_all <- lapply(1:tree_extra$n_states, gen_Si)
S_partial <- lapply(1:tree_extra$n_states, function(i) Sij_all[[i]] %*% multiplier)
# A matrix where each line corresponds to exp(t*d) for the nth edge
# This is gonna be used inside build_Iij function. It was there before,
# But doing this way makes it faster. Just R stuff...
Ib_all <- do.call(rbind, lapply(tree$edge.length, function(t) exp(d*t)))
for (b in 1:tree_extra$n_edges) {
t <- tree$edge.length[b]
# NOTE: Maybe initialize outside the loop and just zero it inside?
partial <- matrix(0, tree_extra$n_states, tree_extra$n_states)
Ib <- Ib_all[b,]
for (i in 1:tree_extra$n_states) {
# 3. S, which uses the Q_eigen and E, a matrix with zero entries
# except for the iith element, which is 1
# Set new Ei
Si_partial = S_partial[[i]]
for (j in 1:tree_extra$n_states) {
# Set new Ej
Sj <- Sij_all[[j]]
# 4. Iij, which uses the edge length t and the eigen values di,dj
# TODO: This can be moved to outside both for loops
Iij <- build_Iij(Ib, d, i, j, t, tol)
partial <- partial + ((Si_partial %*% Sj) * Iij)
}
}
ret[,,b] <- partial
}
return (ret)
}
# Maybe its better to build the entire matrix with all
# possible values once and just use its values afterwards
build_Iij <- function(Ib, d, i, j, t, tol) {
# NOTE: We are comparing two floats here,
# tolerance got from (Paula Tataru, 2011)
# make.simmap (phytools) accepts this tolerance as a
# parameter, should we have it as a parameter too?
# NOTE: NEVER use all.equal() when not really necessary, it's absurdly
# slow..
diff <- d[i] - d[j]
if (abs(diff) <= tol) {
return ( t * Ib[i] )
} else {
return ( (Ib[i] - Ib[j]) / diff )
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.