## In contrast with the other functions in diversitree, we're going to
## use a formula interface here; this will be a little more cumbersome
## for two parameters.
## The "contrasts" based approach comes from Freckleton 2012; thanks
## to Rob Freckleton for clarifying the approach during implementation.
make.pgls <- function(tree, formula, data, control=list()) {
control <- check.control.pgls(control)
cache <- make.cache.pgls(tree, formula, data, control)
if (control$method == "vcv")
all_branches <- make.all_branches.pgls.vcv(cache, control)
else if (control$method == "pruning")
all_branches <- make.all_branches.pgls.pruning(cache, control)
else if (control$method == "contrasts")
all_branches <- make.all_branches.pgls.contrasts(cache, control)
check.pars <- make.check.pars.pgls(cache$info$np)
ll <- function(pars) {
check.pars(pars)
all_branches(pars)
}
## NOTE: using pgls.dt here to distinguish this from caper's pgls
## class so that 'print' works.
class(ll) <- c("pgls.dt", "dtlik", "function")
ll
}
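## A minimal usage sketch (illustrative only; assumes 'ape' is
## available for simulating a tree, and the coefficient values are
## arbitrary).  Parameters are ordered c(coefficients, s2), matching
## default.argnames.pgls() below, and the resulting likelihood can be
## passed to find.mle() or mcmc() in the usual way:
##   set.seed(1)
##   tree <- ape::rtree(50)
##   x <- setNames(rnorm(50), tree$tip.label)
##   dat <- data.frame(x=x, y=2 * x + rnorm(50, sd=0.1),
##                     row.names=tree$tip.label)
##   lik <- make.pgls(tree, y ~ x, dat)
##   lik(c(0, 2, 0.1))  # log-likelihood at c(intercept, slope, s2)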
make.cache.pgls <- function(tree, formula, data, control) {
tree <- check.tree(tree, ultrametric=FALSE)
cache <- check.data.pgls(tree, formula, data)
cache$n <- length(tree$tip.label)
## Hard coded for BM, for now:
cache$phylo.model.info <- make.info.bm(NULL)
if (control$method == "vcv") {
cache$vcv <- vcv(tree)
cache$vcv.inverse <- solve(cache$vcv)
cache$vcv.logdet <- determinant(cache$vcv, logarithm=TRUE)$modulus
## Rename in order to match up with the equations better:
cache$X <- cache$predictors
cache$Y <- cache$response
} else if (control$method == "pruning") {
cache <- c(cache, make.cache(tree))
cache$states.sd <- rep(0, cache$n) # assume no measurement error
cache$X <- cache$predictors
cache$Y <- cache$response
} else { # contrasts
pred <- cache$predictors[,-1,drop=FALSE]
cache$u.x <- apply(pred, 2, pic, tree)
cache$u.y <- pic(cache$response, tree)
cache$V <- pic(cache$predictors[,1], tree,
var.contrasts=TRUE)[,"variance"]
cache$V0 <- pgls.root.var.bm(tree)
cache$root.x <- apply(pred, 2, pgls.root.mean.bm, tree=tree)
cache$root.y <- pgls.root.mean.bm(tree, cache$response)
}
cache$info <- make.info.pgls(tree, cache$predictors,
cache$phylo.model.info)
cache
}
make.info.pgls <- function(phy, predictors, phylo.model.info) {
list(name="pgls",
name.pretty="PGLS",
## Parameters:
np=ncol(predictors) + phylo.model.info$np,
argnames=default.argnames.pgls(predictors),
## Variables:
ny=3L, # needed for pruning, ignored elsewhere
k=NA,
idx.e=NA,
idx.d=NA,
## Phylogeny:
phy=phy,
## Inference:
ml.default="subplex",
mcmc.lowerzero=FALSE,
## These are optional
doc=NULL,
## Need some other references in here, too.
reference=c(
"Freckleton R.P. 2012 Methods in Ecology and Evolution 3:940-947"))
}
default.argnames.pgls <- function(predictors)
c(colnames(predictors), "s2")
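## e.g., for the formula y ~ x the model matrix has columns
## "(Intercept)" and "x", so the argnames are c("(Intercept)", "x", "s2").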
make.all_branches.pgls.vcv <- function(cache, control) {
X <- cache$X
Y <- cache$Y
n <- cache$n
vcv.logdet <- as.numeric(cache$vcv.logdet)
vcv.inverse <- cache$vcv.inverse
## This is a direct implementation of equation 15 in Freckleton
## 2012; the only modification is that we make the translation
## log |s2 V| = n log (s2) + log |V|
## so that we don't recompute the determinant, and
## (s2 V)^{-1} = V^{-1} / s2
## so that we don't recompute the inverse.
## It's possible that the
## t(z) %*% (vcv.inverse / s2)
## could be replaced by
## crossprod(z, vcv.inverse/s2)
## for a small speed saving, but not bothering right now.
##
## There are also constants (n * log(2 * pi) + vcv.logdet) that can be
## precomputed easily enough.
function(pars) {
b <- pars[-length(pars)]
s2 <- pars[[length(pars)]]
z <- as.numeric(Y - X %*% b)
-(n * log(2 * pi) +
n * log(s2) + vcv.logdet +
as.numeric(t(z) %*% (vcv.inverse / s2) %*% (z))) / 2
}
}
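## A sketch of the variant suggested in the comments above: precompute
## the constant term and use crossprod() for the quadratic form.  This
## should return the same values as the closure above:
##   const <- n * log(2 * pi) + vcv.logdet
##   function(pars) {
##     b  <- pars[-length(pars)]
##     s2 <- pars[[length(pars)]]
##     z  <- as.numeric(Y - X %*% b)
##     -(const + n * log(s2) +
##       as.numeric(crossprod(z, vcv.inverse %*% z)) / s2) / 2
##   }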
make.all_branches.pgls.pruning <- function(cache, control) {
X <- cache$X
Y <- cache$Y
## Indices to the parameter vector for the linear and phylogenetic
## part of the model.
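## (For example, with y ~ x under BM, pars is c(intercept, slope, s2),
## so i.linear is 1:2 and pars[i.phylo] is just s2.)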
i.linear <- seq_len(cache$info$np - cache$phylo.model.info$np)
i.phylo <- -i.linear
all_branches <- make.all_branches.pgls.pruning.phylo(cache, control)
function(pars) {
b <- pars[i.linear]
z <- as.numeric(Y - X %*% b)
res <- all_branches(pars[i.phylo], z)
rootfunc.pgls.pruning(res)
}
}
make.all_branches.pgls.contrasts <- function(cache, control) {
u.x <- cache$u.x
u.y <- cache$u.y
np <- cache$info$np
n <- cache$n
V <- cache$V
V0 <- cache$V0
root.x <- cache$root.x
root.y <- cache$root.y
## Note that n*log(2 * pi) + sum(log(V)) + log(V0) can be factored
## out; this is constant every step. So we'll end up with something
## that looks like:
## const <- -0.5 * (n * log(2 * pi) + sum(log(V)) + log(V0))
## const +
## -0.5 * (n * log(s2) +
## sum((u.y - u.x %*% b)^2) / s2 +
## (intercept - (root.y - b %*% root.x))^2 / (s2 * V0))
function(pars) {
intercept <- pars[[1]]
b <- pars[-c(1, np)]
s2 <- pars[[np]]
ll <- -(n * log(2 * pi) +
n * log(s2) +
sum(log(V)) +
log(V0) +
sum((u.y - u.x %*% b)^2) / s2) / 2
dll <- -0.5 * (intercept - (root.y - b %*% root.x))^2 / (s2 * V0)
as.numeric(ll + dll)
}
}
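## Hedged cross-check (illustrative; reusing the tree/dat sketch near
## make.pgls above): the different methods are intended to agree, e.g.
##   lik.v <- make.pgls(tree, y ~ x, dat, control=list(method="vcv"))
##   lik.c <- make.pgls(tree, y ~ x, dat, control=list(method="contrasts"))
##   all.equal(lik.v(c(0, 2, 0.1)), lik.c(c(0, 2, 0.1)))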
check.data.pgls <- function(tree, formula, data, allow.unnamed=FALSE) {
if (!inherits(formula, "formula"))
stop("'formula' must be a formula object")
if (!is.data.frame(data))
stop("'data' must be a data.frame")
if (is.null(rownames(data))) {
if (allow.unnamed) {
if (nrow(data) == length(tree$tip.label)) {
rownames(data) <- tree$tip.label
warning("Assuming data rows are in tree$tip.label order")
} else {
stop(sprintf("Invalid data length (expected %d rows)",
length(tree$tip.label)))
}
} else {
stop("'data' must contain row names")
}
}
if (!all(tree$tip.label %in% rownames(data)))
stop("Not all species have state information")
## Reorder the data so it matches the tree
data <- data[match(tree$tip.label, rownames(data)), , drop=FALSE]
## For now, missing data will straight up cause a fail.
m <- model.frame(formula, data, na.action=na.fail)
predictors <- model.matrix(formula, m)
response <- structure(m[[1]], names=rownames(m))
list(tree=tree, formula=formula, data=data,
response=response, predictors=predictors)
}
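## For example, with formula y ~ x, model.matrix() adds an
## "(Intercept)" column, so 'predictors' is an n x 2 matrix and
## 'response' is the named vector of y values:
##   d <- data.frame(x=1:3, y=c(2, 4, 6), row.names=c("sp1", "sp2", "sp3"))
##   m <- model.frame(y ~ x, d)
##   model.matrix(y ~ x, m)  # columns "(Intercept)" and "x"
##   m[[1]]                  # the response, y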
make.check.pars.pgls <- function(k) {
function(pars) {
if (length(pars) != k)
stop("Incorrect parameter length")
if (pars[[k]] <= 0)
stop("Diffusion coefficient must be greater than zero")
invisible(TRUE)
}
}
## Simple control checking.
##
## TODO: Allow a switch that would give us REML estimates? What would
## that mean in the context of Bayesian inference? See Rob's email
## for some direction here.
check.control.pgls <- function(control) {
defaults <- list(method="vcv", REML=FALSE, backend="R")
control <- modifyList(defaults, control)
if ( length(control$method) != 1 )
stop("control$method must be a scalar")
methods <- c("vcv", "pruning", "contrasts")
if (!(control$method %in% methods))
stop(sprintf("control$method must be in %s",
paste(methods, collapse=", ")))
if (control$method == "pruning") { # no need to check otherwise
if ( length(control$backend) != 1 )
stop("control$backend must be a scalar")
backends <- c("C", "R")
if (!(control$backend %in% backends))
stop(sprintf("control$backend must be in %s",
paste(backends, collapse=", ")))
}
control
}
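## For example, the defaults are method="vcv" with backend="R"; to use
## the C pruning code instead:
##   control <- check.control.pgls(list(method="pruning", backend="C"))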
pgls.root.var.bm <- function(tree) {
n.spp <- length(tree$tip.label)
x <- rep(1, n.spp)
tmp <- pic(x, tree, var.contrasts=TRUE, rescaled.tree=TRUE)
idx <- tmp$rescaled.tree$edge[,1] == n.spp + 1
root.v <- tmp$rescaled.tree$edge.length[idx]
prod(root.v)/sum(root.v)
}
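## Hedged cross-check: this should equal the GLS variance of the root
## (intercept) estimate computed from the full covariance matrix,
##   V <- ape::vcv(tree)
##   1 / sum(solve(V))
## but without forming or inverting V.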
## Could probably do this better; this is a pretty heavyweight way of
## doing this calculation. A better way might be to get this in much
## the same way that pgls.root.var.bm works. However, that uses
## internal features of ape's pic function that I'd rather not go
## near.
##
## TODO: This is going to be worse when we are trying to do this on a
## rescaled tree. I wonder if we can pull this from the contrasts
## instead?
pgls.root.mean.bm <- function(tree, states) {
lik <- make.bm(tree, states, control=list(method="pruning", backend="C"))
attr(lik(1, intermediates=TRUE), "vals")[[1]]
}
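## Usage sketch: for tip states 'x' named by tree$tip.label this
## returns the BM (GLS-weighted) estimate of the root state,
##   pgls.root.mean.bm(tree, x)
## which should match the vcv-based estimate
##   sum(solve(V) %*% x) / sum(solve(V))  # with V <- ape::vcv(tree)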
fitted.pgls.dt <- function(object, p, ...) {
b <- p[-length(p)]
cache <- get.cache(object)
X <- cache$predictors
X %*% b
}
fitted.fit.mle.pgls <- function(object, ...) {
fitted(get.likelihood(object), stats::coef(object, ...))
}
fitted.mcmcsamples.pgls <- function(object, ...) {
p <- stats::coef(object, ...)
lik <- get.likelihood(object)
ret <- apply(p, 1, function(x) fitted(lik, x))
rownames(ret) <- rownames(get.cache(lik)$predictors)
colnames(ret) <- NULL
ret
}
## Residuals follow from the fitted:
residuals.pgls.dt <- function(object, p, ...) {
get.cache(object)$response - fitted(object, p, ...)
}
residuals.fit.mle.pgls <- function(object, ...) {
get.cache(get.likelihood(object))$response - fitted(object, ...)
}
residuals.mcmcsamples.pgls <- residuals.fit.mle.pgls
## This organises wrapping around the normal "all_branches" functions,
## from the point of view of:
##
## * only some of the parameters belong to a phylogenetic model
## * the data change between calls.
##
## We will take parameters (all of them, and we'll subset down) and a
## vector of residuals to act as new data. Then we'll shepherd
## through the appropriate phylogenetic model. For now this will just
## be BM, but eventually will be OU, EB and lambda too.
##
## To make that make sense, we need extra elements in the cache
## object.
make.all_branches.pgls.pruning.phylo <- function(cache, control) {
## Dummy variable, needed for initial.tip.bm.pruning()
cache$states <- cache[["Y"]]
## This is needed by make.all_branches.continuous to set up
## initial tip memory:
cache$y <- initial.tip.bm.pruning(cache)
## Then, we build things based on the *phylo* model, not the
## container.
cache$info <- cache$phylo.model.info
all_branches <- make.all_branches.bm.pruning(cache, control)
if (control$backend == "R") {
y <- get.cache(all_branches)$y$y
idx <- match(seq_len(cache$n), cache$y$target)
function(pars, residuals) {
y[1,idx] <- residuals
environment(all_branches)$cache$y$y <- y
all_branches(pars)
}
} else { # backend "C"
ptr <- environment(all_branches)$ptr
## NOTE: We need the mangled version here - differently ordered!
y <- environment(all_branches)$cache.C$y$y
function(pars, residuals) {
y[1,] <- residuals
.Call(r_dt_cont_reset_tips, ptr, y)
all_branches(pars)
}
}
}
rootfunc.pgls.pruning <- function(res) {
ll <- rootfunc.bm.pruning(res, NA, ROOT.MAX, NA, FALSE)
## In comparison with model.pgls:
## res$vals[[1]] is equal to intercept - (root.y - b %*% root.x)
## res$vals[[2]] is equal to s2 * V0
## I think I could probably do this better through root.x though,
## but not sure how and don't think it matters that much.
dll <- -0.5 * res$vals[[1]]^2 / res$vals[[2]]
ll + dll
}
## TODO: nlme::gls returns a vector of residuals, but caper::pgls
## returns a 1-column matrix. What should we return? I don't like
## the 1 column matrix much!