#' compare_pangenomes
#'
#'
#' @description Compares the coefficients between two pangenomes datasets by including interaction terms in the GLM.
#'
#' @param fitA a `panfit` object generated by the `panstripe` function
#' @param fitB a `panfit` object generated by the `panstripe` function
#' @param family the family used by glm. One of 'Tweedie', 'Poisson', 'Gamma' or 'Gaussian'. (default='Tweedie')
#' @param intercept whether or not to include an intercept term in the GLM (default=FALSE). Adding an intercept can increase the robustness of the algorithm to errors at higher branches of the phylogeny at the expense of less sensitivity.
#' @param modeldisp whether or not to model the dispersion as a function of the covariates of interest if using a Tweedie family (default=TRUE)
#' @param ci_type the method used to calculate the bootstrap CI (default='perc'). See \link[boot]{boot.ci} for more details.
#' @param conf A scalar indicating the confidence level of the required intervals (default=0.95)
#' @param nboot the number of bootstrap replicates to perform (default=100)
#' @param boot_pvalue whether or not to calculate bootstrap p-values (default=FALSE)
#'
#' @return a list containing a summary of the comparison and the resulting `glm` model object
#'
#' @examples
#'
#' simA <- simulate_pan(rate=1e-4, ngenomes = 200, mean_trans_size=3, fp_error_rate=1)
#' simB <- simulate_pan(rate=1e-3, ngenomes = 200, mean_trans_size=3, fp_error_rate=1)
#' fitA <- panstripe(simA$pa, simA$tree, nboot=0)
#' fitA$summary
#' fitB <- panstripe(simB$pa, simB$tree, nboot=0)
#' fitB$summary
#' comp <- compare_pangenomes(fitA, fitB, nboot=10)
#' comp$summary
#'
#' @export
compare_pangenomes <- function(fitA, fitB, family="Tweedie", intercept=FALSE, modeldisp=TRUE,
ci_type='perc', conf=0.95, nboot=100, boot_pvalue=FALSE){
# input checks
if (class(fitA) != 'panfit') stop('fitA is not of class `panfit`!')
validate_panfit(fitA)
if (class(fitB) != 'panfit') stop('fitB is not of class `panfit`!')
validate_panfit(fitB)
#combine data
dat <- purrr::imap_dfr(list(fitA, fitB), ~{
tibble::add_column(.x$data, pangenome=.y-1, .before=1)
})
# check for all 0's
if ((sum(fitA$data$acc[fitA$data$istip==0])==0) | (sum(fitA$data$acc[fitA$data$istip==1])==0) |
(sum(fitB$data$acc[fitB$data$istip==0])==0) | (sum(fitB$data$acc[fitB$data$istip==1])==0)) {
warning("No gene gains/losses identified at all at internal branches or tips in one or both pangenomes!")
}
# set model
if (intercept){
model <- stats::as.formula("acc ~ 0 + pangenome + istip + core + depth + istip:core + depth:pangenome + istip:pangenome + core:pangenome + istip:core:pangenome")
} else {
model <- stats::as.formula("acc ~ 0 + istip + core + depth + istip:core + depth:pangenome + istip:pangenome + core:pangenome + istip:core:pangenome")
}
# set dispersion model
if (modeldisp){
dmodel <- stats::as.formula("acc ~ pangenome")
} else {
dmodel <- stats::as.formula("acc ~ 1")
}
# fit model
if (family=='Tweedie'){
m <- fit_double_tweedie(model, dmodel=dmodel, data = dat)
a <- anova.dglm.basic(m, tweedie.power = m$p)
} else {
m <- stats::glm(model, dat, family = family)
}
s <- summary(m)$coefficients %>%
tibble::as_tibble(rownames = 'term')
s <- s[s$term %in% c('istip:pangenome', 'core:pangenome', 'depth:pangenome',
'pangenome:istip', 'pangenome:core', 'pangenome:depth'), , drop=FALSE]
s$term <- gsub("pangenome:", "", s$term)
s$term <- gsub(":pangenome", "", s$term)
colnames(s) <- c('term','estimate','std.error','statistic','p.value')
if ((family=='Tweedie') & modeldisp){
s <- s %>% tibble::add_row(
term='dispersion model',
estimate=NA,
std.error=NA,
statistic=a$Chisq,
p.value=a$p.value)
}
# run bootstrap
if (nboot>1){
if (family=='Tweedie'){
boot_reps <- boot::boot(dat, fit_double_model,
R = nboot,
stype='i',
strata=factor(dat$pangenome),
model=model, dmodel=dmodel)
} else {
boot_reps <- boot::boot(dat, fit_model,
R = nboot,
stype='i',
strata=factor(dat$pangenome),
model=model, family=family, boot_type='branch', fit_method='glm')
}
ci <- purrr::map_dfr(which(grepl('^[a-z]+:pangenome$', names(m$coefficients)) |
grepl('^pangenome:[a-z]+', names(m$coefficients))), ~{
df <- as.data.frame(t(boot_ci_pval(boot_reps, index=.x, type=ci_type,
theta_null=0, ci_conf=conf,
transformation='identity', calc_pval = boot_pvalue)))
df$term <- gsub("pangenome:|:pangenome", "", names(m$coefficients)[[.x]])
return(df)
})
s$`bootstrap CI 2.5%` <- signif(as.numeric(ci$V1[match(s$term, ci$term)]), 5)
s$`bootstrap CI 97.5%` <- signif(as.numeric(ci$V2[match(s$term, ci$term)]), 5)
if (boot_pvalue){
s$`bootstrap p-value` <- signif(as.numeric(ci$V3[match(s$term, ci$term)]), 5)
}
} else {
boot_reps <- NULL
s$`bootstrap CI 2.5%` <- NA
s$`bootstrap CI 97.5%` <- NA
}
return(list(
summary=s,
model=m,
data=dat
))
}
double_tweedie_llk <- function(p, model, dmodel, data){
dm <- dglm_mod(model,
dformula = dmodel,
data = data,
family = statmod::tweedie(var.power = p, link.power = 0),
tweedie.var.power=p)
return(dm$m2loglik)
}
fit_double_tweedie <- function(model, dmodel, data){
fm <- tryCatch(
{
op <- stats::optimise(double_tweedie_llk, lower = 1.01, upper = 1.99, model=model, dmodel=dmodel, data=data)
tm <- dglm_mod(formula = model,
dformula = dmodel,
data = data,
family = statmod::tweedie(var.power = op$minimum, link.power = 0),
tweedie.var.power=op$minimum)
# check convergence
ftm <- stats::glm(model, data, family = statmod::tweedie(var.power = op$minimum, link.power = 0))
tm$p <- op$minimum
tm$converged <- ftm$converged
tm
},
error=function(cond) {
stop(
"Panstripe model fit failed! This can sometime be caused by unusual branch lengths.
Setting family='quasipoisson' or 'gaussian' often provides a more stable fit to difficult datasets"
)
}
)
if (!fm$converged) {
warning(
"Panstripe model fit failed to converge!
Setting family='gaussian' often provides a more stable fit to difficult datasets"
)
}
return(fm)
}
fit_double_model <- function(d, indices=NULL, model, dmodel){
stopifnot(length(indices)==nrow(d))
tdat <- d[indices,]
tm <- fit_double_tweedie(model, dmodel = dmodel, data = tdat)
coef <- c(tm$coefficients, tm$dispersion.fit$coefficients)
return(coef)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.