#' panstripe
#'
#'
#' @description Fits a Tweedie distribution to the inferred gene gain and loss events along each branch of a given phylogeny.
#' Includes covariates to control for the impact of annotation errors and the depth of ancestral branches.
#'
#' @param pa a binary gene presence/absence matrix with genes as columns and genomes as rows. The rownames should match the tip.labels of the corresponding phylogeny.
#' @param tree a core gene phylogeny of class \link{phylo}
#' @param asr_method method used to perform ancestral state reconstruction. Can be either 'max.parsimony' (default), 'max.likelihood', or 'stochastic.map'
#' @param min_depth the minimum depth of a branch to be included in the regression. All branches are included by default.
#' @param family the family used by glm. One of 'Tweedie', 'Poisson', 'Gamma' or 'Gaussian'. (default='Tweedie')
#' @param intercept whether or not to include an intercept term in the GLM (default=FALSE). Adding an intercept can increase the robustness of the algorithm to errors at higher branches of the phylogeny at the expense of less sensitivity.
#' @param fit_method the method used to fit the GLM. Can be either 'glm' (default) which uses base R and the 'tweedie' package or 'glmmTMB' which uses Template Model Builder.
#' @param ci_type the method used to calculate the bootstrap CI (default='perc'). See \link[boot]{boot.ci} for more details.
#' @param nboot the number of bootstrap replicates to perform (default=100)
#' @param boot_type whether to resample by 'branch', the default, or by 'gene'
#' @param conf A scalar indicating the confidence level of the required intervals (default=0.95).
#' @param boot_pvalue whether or not to calculate bootstrap p-values (default=FALSE)
#' @param return_anc_states return the calculated branch level ancestral state matrix
#' @param quiet whether to print progress information (default=FALSE)
#'
#' @return a panfit object with the resulting parameter estimates and bootstrap replicates
#'
#' @examples
#'
#' sim <- simulate_pan(rate=5e-4, mean_trans_size=3, fn_error_rate=2, fp_error_rate=2)
#' pa <- sim$pa
#' tree <- sim$tree
#' nboot <- 100
#' family <- 'Tweedie'
#' ci_type='perc'
#' boot_type='branch'
#' conf=0.95
#' asr_method="stochastic.map"
#' res <- panstripe(sim$pa, sim$tree, nboot=100)
#' res$summary
#' res <- panstripe(sim$pa, sim$tree, nboot=100, fit_method='glmmTMB', family='gaussian')
#' res$summary
#'
#' @export
panstripe <- function(pa, tree,
asr_method="max.parsimony",
min_depth=NULL,
family='Tweedie',
intercept=FALSE,
fit_method='glm',
ci_type='perc',
nboot=1000,
boot_type='branch',
conf=0.95,
boot_pvalue=FALSE,
return_anc_states=FALSE,
quiet=FALSE){
#check inputs
if (nrow(pa) != length(tree$tip.label)) stop('number of taxa in tree and p/a matrix need to be the same!')
if(!all(rownames(pa) %in% tree$tip.label)) stop('taxa labels in p/a matrix and tree do not match!')
if(!asr_method %in% c('max.parsimony', 'max.likelihood', 'stochastic.map')) stop("asr_method must be one of 'max.parsimony', 'max.likelihood' or 'stochastic.map'")
if (!(is.character(family) && (family=="Tweedie"))){
if (is.character(family))
family <- get(family, mode = "function", envir = parent.frame())
if (is.function(family))
family <- family()
if (is.null(family$family)) {
print(family)
stop("'family' not recognized")
}
}
pa <- pa[match(tree$tip.label, rownames(pa)),,drop=FALSE]
index <- which(apply(pa, 2, function(x) length(unique(x)))>1)
if (asr_method=='max.likelihood'){
# use maximum likelihood
anc_states <- do.call(cbind, purrr::map(index, ~{
a <- ape::ace(x = pa[,.x], phy = tree, type = 'discrete')
mx <- c(pa[,.x], apply(a$lik.anc, 1, which.max)-1)
return(1*(mx[tree$edge[,2]]!=mx[tree$edge[,1]]))
}))
} else if (asr_method=='stochastic.map') {
sf <- panstripe:::sfreemap(tree, pa = pa[,index], model = "ER", quiet=quiet)
anc_states <- exp(sf$mapped.edge.lmt)
} else {
# use maximum parsimony
anc_states <- do.call(cbind, purrr::map(index, ~{
return(panstripe:::asr_max_parsimony(tree, pa[,.x]+1, Nstates = 2)$change[tree$edge[,2]])
}))
}
dat <- tibble::tibble(
acc=rowSums(anc_states, na.rm = TRUE),
core=tree$edge.length,
istip=as.numeric(tree$edge[,2]<=length(tree$tip.label) )
)
# add depth and filter if requested
dat$depth <- ape::node.depth.edgelength(tree)[tree$edge[,1]]
if (!is.null(min_depth)){
dat <- dat[dat$depth>=min_depth,]
}
# check for all 0's
if (sum(dat$acc[dat$istip==0])==0) {
warning("No gene gains/losses identified on internal branches! Separation may be a problem.")
} else if (sum(dat$acc[dat$istip==1])==0) {
warning("No gene gains/losses identified at phylogeny tips! Separation may be a problem.")
}
# set model
if (intercept){
model <- stats::as.formula("acc ~ istip + core + depth + istip:core")
} else {
model <- stats::as.formula("acc ~ 0 + istip + core + depth + istip:core")
}
# fit model
if (is.character(family) && (family=="Tweedie")){
m <- fit_tweedie(model, data = dat, method=fit_method)
coef <- c(m$coefficients, m$p, m$phi)
} else {
if (fit_method=='glm'){
m <- stats::glm(model, data = dat, family=family)
} else {
m <- glmmTMB::glmmTMB(model, data = dat, family=family)
m$coefficients <- m$fit$par[names(m$fit$par)=='beta']
}
coef <- c(m$coefficients, NA, NA)
}
sm <- summary(m)
if (any(grepl('glmmTMB', m$call))) sm$coefficients <- sm$coefficients$cond
if (intercept){
coef_names <- c('intercept', strsplit(as.character(model), ' \\+ ')[[3]])
} else {
coef_names <- strsplit(as.character(model), ' \\+ ')[[3]][-1]
}
s <- tibble::tibble(
term = c(coef_names, 'p', 'phi'),
estimate = coef,
std.error=c(sm$coefficients[,2], NA, NA),
statistic=c(sm$coefficients[,3], NA, NA),
p.value = c(sm$coefficients[,4], NA, NA)
)
# run bootstrap
if (nboot>1){
if (boot_type=='gene'){
boot_reps <- boot::boot(t(anc_states), fit_model,
R = nboot,
stype='i',
tree=tree, min_depth=min_depth, model=model, family=family, fit_method=fit_method, boot_type='gene')
} else {
boot_reps <- boot::boot(dat, fit_model,
R = nboot,
stype='i',
tree=tree, min_depth=min_depth, model=model, family=family, fit_method=fit_method, boot_type='branch')
}
# calculate CIs and p-values
mindex <- length(coef_names)
if (is.character(family) && (family=="Tweedie")) mindex <- mindex + 2
ci <- do.call(rbind, purrr::map(1:mindex, ~{
transformation <- 'identity'
if (s$term[[.x]]=='p') transformation <- 'logit'
if (s$term[[.x]]=='phi') transformation <- 'log'
boot_ci_pval(boot_reps, index=.x, type=ci_type,
theta_null=0, ci_conf=conf,
transformation=transformation,
calc_pval = boot_pvalue)
}))
if (is.character(family) && (family=="Tweedie")){
s$`bootstrap CI 2.5%` <- signif(as.numeric(ci[,1]), 5)
s$`bootstrap CI 97.5%` <- signif(as.numeric(ci[,2]), 5)
if (boot_pvalue){
s$`bootstrap p-value` <- c(signif(as.numeric(ci[,3]), 5))
}
} else {
s$`bootstrap CI 2.5%` <- c(signif(as.numeric(ci[,1]), 5), NA, NA)
s$`bootstrap CI 97.5%` <- c(signif(as.numeric(ci[,2]), 5), NA, NA)
if (boot_pvalue){
s$`bootstrap p-value` <- c(signif(as.numeric(ci[,3]), 5), NA, NA)
}
}
} else {
s$`bootstrap CI 2.5%` <- NA
s$`bootstrap CI 97.5%` <- NA
boot_reps <- NULL
}
if (!return_anc_states){
anc_states = NULL
}
return(
new_panfit(
summary = s,
model = m,
data = dat,
boot_samples=boot_reps,
tree=tree,
pa=pa,
anc_states=anc_states
)
)
}
twd_llk <- function(p, model, data) {
# suppressWarnings({
tm <- stats::glm(model, data, family = statmod::tweedie(var.power = p, link.power = 0))
# })
tsm <- summary(tm)
-sum(log(tweedie::dtweedie(y = tm$y, mu = tm$fitted.values, phi = tsm$dispersion, power = p)))
}
fit_tweedie <- function(model, data, method='glm'){
if (method=='glm'){
fm <- tryCatch(
{
op <- stats::optimise(twd_llk, lower = 1, upper = 2, model=model, data=data)
tm <- stats::glm(model, data, family = statmod::tweedie(var.power = op$minimum, link.power = 0))
stm <- summary(tm)
tm$p <- op$minimum
tm$phi <- stm$dispersion
tm
},
error=function(cond) {
stop(
"Panstripe model fit failed! This can sometime be caused by unusual branch lengths.
Setting fit_method='glmmTMB' or family='quasipoisson' or 'gaussian' often provides a more stable fit to difficult datasets"
)
}
)
} else {
fm <- glmmTMB::glmmTMB(model, data = data, family = glmmTMB::tweedie)
fm$coefficients <- fm$fit$par[1:(length(fm$fit$par)-2)]
fm$p <- min(1.99, max(1.01, exp(1+fm$fit$par[[length(fm$fit$par)]])))
fm$phi <- exp(fm$fit$par[[length(fm$fit$par)-1]])
if (fm$fit$convergence!=0){
warning(
"Panstripe model fit failed to converge!
Setting family='quasipoisson' or 'gaussian' often provides a more stable fit to difficult datasets"
)
}
}
return(fm)
}
fit_model <- function(d, indices=1:nrow(d), tree=NULL, min_depth=NULL, model, family, fit_method, boot_type){
stopifnot(length(indices)==nrow(d))
stopifnot(boot_type %in% c('branch', 'gene'))
# sometimes it is hard to get the model to converge for a particular sample. We attempt a few which hopefully does not bias things too much.
coef <- NULL
attempt <- 0
max_attempt <- 5
while(is.null(coef) && (attempt<=max_attempt)){
attempt <- attempt + 1
try({
if (boot_type == 'branch') {
tdat <- d[indices,]
} else {
tdat <- tibble::tibble(
acc=colSums(d[indices,]),
core=tree$edge.length,
istip=tree$edge[,2]<=length(tree$tip.label)
)
# add depth
tdat$depth <- ape::node.depth.edgelength(tree)[tree$edge[,2]]
if (!is.null(min_depth)){
tdat <- tdat[tdat$min_depth>=min_depth,]
}
}
if (is.character(family) && (family=="Tweedie")){
tm <- fit_tweedie(model, data = tdat, method=fit_method)
coef <- c(tm$coefficients, tm$p, tm$phi)
} else {
if (fit_method=='glm'){
tm <- stats::glm(model, data = tdat, family=family)
} else {
tm <- glmmTMB::glmmTMB(model, data = tdat, family=family)
tm$coefficients <- tm$fit$par[names(tm$fit$par)=='beta']
}
coef <- c(tm$coefficients, NA, NA)
}
})
}
if (is.null(coef) && (attempt==max_attempt+1)){
stop('Model fitting failed to converge in bootstrap replicate!')
}
return(coef)
}
boot_ci_pval <- function(boot_res, index, type,
theta_null=0,
precision=NULL,
ci_conf=0.95,
transformation='identity',
calc_pval=FALSE) {
if (!transformation %in% c('identity','logit','log')) stop('Invalid transformation!')
if (is.null(precision)){
precision = 1/boot_res$R
}
if (transformation=='logit'){
ll <- stats::make.link('logit')
h <- function(x) ll$linkfun(x-1)
hinv <- function(x) ll$linkinv(x)+1
hdot <- function(x) ll$mu.eta(x)
} else if (transformation=='log'){
ll <- stats::make.link('log')
h <- function(x) ll$linkfun(x)
hinv <- function(x) ll$linkinv(x)
hdot <- function(x) ll$mu.eta(x)
} else {
h <- function(x) x
hinv <- function(x) x
hdot <- function(x) 1
}
if (calc_pval){
#calculate p-value by inverting the corresponding confidence intervals, as described in Section 3.12 in Hall (1992)
alpha_seq <- seq(precision, 1 - 1e-16, precision)
ci <- suppressWarnings({boot::boot.ci(boot_res, conf = 1 - alpha_seq,
type = type, index=index,
h = h, hinv = hinv, hdot = hdot)})
bounds <- switch(type,
norm = ci$normal[, 2:3],
basic = ci$basic[,4:5],
stud = ci$student[, 4:5],
perc = ci$percent[, 4:5],
bca = ci$bca[, 4:5])
alpha <- alpha_seq[which.min(theta_null >= bounds[, 1] &
theta_null <= bounds[, 2])]
}
#calculate CI
ci <- suppressWarnings({boot::boot.ci(boot_res, conf = ci_conf,
type = type, index=index,
h = h, hinv = hinv, hdot = hdot)})
bounds <- switch(type,
norm = ci$normal[, 2:3],
basic = ci$basic[,4:5],
stud = ci$student[, 4:5],
perc = ci$percent[, 4:5],
bca = ci$bca[, 4:5])
if (calc_pval){
return(c(sort(bounds), alpha))
} else {
return(sort(bounds))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.