#' compare_pangenome_covariates
#'
#' Tests whether covariates associated with different pangenomes are significantly associated with gene gain and loss or errors.
#'
#' @param fits a list of `panfit` objects generated by the `panstripe` function
#' @param covariates a `data.frame` object generated where the first column matches the names in the list of pangenomes. Covariates to be tested are given in subsequent columns.
#' @param family the family used by glm. One of 'Tweedie', 'Poisson', 'Gamma' or 'Gaussian'. (default='Tweedie')
#' @param keep a vector of a subset of the column names in the covariate data.frame. (default='all')
#' @param modeldisp whether or not to model the dispersion as a function of the covariates of interest if using a Tweedie family (default=FALSE)
#' @param ci_type the method used to calculate the bootstrap CI (default='bca'). See \link[boot]{boot.ci} for more details.
#' @param conf A scalar indicating the confidence level of the required intervals (default=0.95)
#' @param nboot the number of bootstrap replicates to perform (default=100)
#'
#' @return a list containing a summary of the comparison and the resulting `glm` model object
#'
#' @examples
#'
#' simA <- simulate_pan(rate=1e-4, ngenomes = 50, fn_error_rate=1, fp_error_rate=1)
#' simB <- simulate_pan(rate=1e-3, ngenomes = 200, fn_error_rate=1, fp_error_rate=1)
#' simC <- simulate_pan(rate=5e-3, ngenomes = 100, fn_error_rate=1, fp_error_rate=1)
#' tfits <- purrr::map(list(A=simA, B=simB, C=simC), ~{
#' panstripe(.x$pa, .x$tree, nboot=10, ci_type='perc')
#' })
#'
#' covariates <- tibble::tibble(
#' pangenome=c('A','B','C','E','F','G'),
#' dummy=c(1,2,3,1,2,2)
#' )
#' fits <- c(tfits, list(E=tfits[[1]], F=tfits[[2]], G=tfits[[3]]))
#' comp <- compare_pangenome_covariates(fits, covariates, modeldisp=TRUE)
#' comp$summary
#'
#' @export
compare_pangenome_covariates <- function(fits, covariates, family='Tweedie', keep='all',
modeldisp=FALSE, ci_type='norm', conf=0.95, nboot=100){
# input checks
if (class(fits) != 'list') stop('fits must be a list of panfit objects!')
if (any(purrr::map_lgl(fits, ~ class(.x) != 'panfit'))) stop('fits must be a list of panfit objects!')
purrr::walk(fits, ~ validate_panfit(.x))
if ((nrow(covariates)!=length(fits)) | (!all(unlist(covariates[,1]) %in% names(fits)))) stop(
'first column of covariates data.frame must match fits list names!')
if (keep=='all'){
keep <- colnames(covariates)[2:ncol(covariates)]
} else {
if (!all(keep %in% colnames(covariates))) stop('keep must be a subset of the column names in the covariates data.frame!')
}
#combine data
dat <- purrr::imap_dfr(fits, ~{
tibble::add_column(.x$data, pangenome=.y, .before=1)
})
covariates <- covariates[match(dat$pangenome, unlist(covariates[,1])),]
dat <- cbind(dat, covariates[,2:ncol(covariates),drop=FALSE])
# fit model
model <- stats::as.formula(paste(c("acc ~ 0 + istip + core + depth",
"istip:core",
paste('istip', keep, sep=':'),
paste('core', keep, sep=':'),
paste('depth', keep, sep=':'),
paste('istip:core', keep, sep=':')), collapse = ' + '))
if(modeldisp){
dmodel <- stats::as.formula(paste(c("acc ~ 1" , keep), collapse = ' + '))
} else {
dmodel <- stats::as.formula("acc ~ 1")
}
if (family=="Tweedie"){
m <- fit_double_tweedie(model = model, data = dat, dmodel = dmodel)
a <- anova.dglm.basic(m, tweedie.power = m$p)
} else {
m <- stats::glm(model, dat, family = family)
}
s <- summary(m)$coefficients %>%
tibble::as_tibble(rownames = 'term')
coef <- s$term
s <- s[!s$term %in% c('(Intercept)', 'istip', 'core', 'depth', 'istip:core'), , drop=FALSE]
indices <- which(coef %in% s$term)
s$term <- gsub("TRUE", "", s$term)
colnames(s) <- c('term','estimate','std.error','statistic','p.value')
if ((family=='Tweedie') & modeldisp){
s <- s %>% tibble::add_row(
term=paste('dispersion model:', keep),
estimate=NA,
std.error=NA,
statistic=a$Chisq,
p.value=a$p.value)
}
# run bootstrap
if (nboot>1){
if (family=='Tweedie'){
boot_reps <- boot::boot(dat, fit_double_model,
strata = factor(dat$pangenome),
R = nboot,
stype='i',
model=model, dmodel=dmodel)
} else {
boot_reps <- boot::boot(dat, fit_model,
strata = factor(dat$pangenome),
R = nboot,
stype='i',
model=model, family=family, boot_type='branch')
}
ci <- purrr::map_dfr(indices, ~{
df <- tibble::as_tibble(t(boot_ci_pval(boot_reps, index=.x, type=ci_type,
theta_null=0, ci_conf=conf,
transformation='identity')))
df$term <- gsub("TRUE", "", names(m$coefficients)[[.x]])
return(df)
})
s$`bootstrap CI 2.5%` <- signif(ci$V1, 5)[match(s$term, ci$term)]
s$`bootstrap CI 97.5%` <- signif(ci$V2, 5)[match(s$term, ci$term)]
} else {
s$`bootstrap CI 2.5%` <- NA
s$`bootstrap CI 97.5%` <- NA
}
return(list(
summary=s,
model=m
))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.