panstripe: Pangenome Plots

Documented in compare_pangenome_covariates

#' compare_pangenome_covariates
#'
#' Tests whether covariates associated with different pangenomes are significantly associated with gene gain and loss or errors.
#'
#' @param fits a list of `panfit` objects generated by the `panstripe` function
#' @param covariates a `data.frame` object generated where the first column matches the names in the list of pangenomes. Covariates to be tested are given in subsequent columns.
#' @param family the family used by glm. One of 'Tweedie', 'Poisson', 'Gamma' or 'Gaussian'. (default='Tweedie')
#' @param keep a vector of a subset of the column names in the covariate data.frame. (default='all')
#' @param modeldisp whether or not to model the dispersion as a function of the covariates of interest if using a Tweedie family (default=FALSE)
#' @param ci_type the method used to calculate the bootstrap CI (default='bca'). See \link[boot]{boot.ci} for more details.
#' @param conf A scalar indicating the confidence level of the required intervals (default=0.95)
#' @param nboot the number of bootstrap replicates to perform (default=100)
#' 
#' @return a list containing a summary of the comparison and the resulting `glm` model object
#'
#' @examples
#'
#' simA <- simulate_pan(rate=1e-4, ngenomes = 50, fn_error_rate=1, fp_error_rate=1)
#' simB <- simulate_pan(rate=1e-3, ngenomes = 200, fn_error_rate=1, fp_error_rate=1)
#' simC <- simulate_pan(rate=5e-3, ngenomes = 100, fn_error_rate=1, fp_error_rate=1)
#' tfits <- purrr::map(list(A=simA, B=simB, C=simC), ~{
#'   panstripe(.x$pa, .x$tree, nboot=10, ci_type='perc')
#' })
#' 
#' covariates <- tibble::tibble(
#'   pangenome=c('A','B','C','E','F','G'),
#'   dummy=c(1,2,3,1,2,2)
#' )
#' fits <- c(tfits, list(E=tfits[[1]], F=tfits[[2]], G=tfits[[3]]))
#' comp <- compare_pangenome_covariates(fits, covariates, modeldisp=TRUE)
#' comp$summary
#'
#' @export
compare_pangenome_covariates <- function(fits, covariates, family='Tweedie', keep='all', 
                                         modeldisp=FALSE, ci_type='norm', conf=0.95, nboot=100){
  
  # input checks
  if (class(fits) != 'list') stop('fits must be a list of panfit objects!')
  if (any(purrr::map_lgl(fits, ~ class(.x) != 'panfit'))) stop('fits must be a list of panfit objects!')
  purrr::walk(fits, ~ validate_panfit(.x))
  
  if ((nrow(covariates)!=length(fits)) | (!all(unlist(covariates[,1]) %in% names(fits)))) stop(
    'first column of covariates data.frame must match fits list names!')
  
  if (keep=='all'){
    keep <- colnames(covariates)[2:ncol(covariates)]
  } else {
    if (!all(keep %in% colnames(covariates))) stop('keep must be a subset of the column names in the covariates data.frame!')
  }
  
  #combine data
  dat <- purrr::imap_dfr(fits, ~{
    tibble::add_column(.x$data, pangenome=.y, .before=1)
  })
  
  covariates <- covariates[match(dat$pangenome, unlist(covariates[,1])),]
  dat <- cbind(dat, covariates[,2:ncol(covariates),drop=FALSE])
  
  # fit model
  model <- stats::as.formula(paste(c("acc ~ 0 + istip + core + depth",
                                     "istip:core",
                                     paste('istip', keep, sep=':'),
                                     paste('core', keep, sep=':'),
                                     paste('depth', keep, sep=':'),
                                     paste('istip:core', keep, sep=':')), collapse = ' + '))
  
  if(modeldisp){
    dmodel <- stats::as.formula(paste(c("acc ~ 1" , keep), collapse = ' + ')) 
  } else {
    dmodel <- stats::as.formula("acc ~ 1")  
  }
  
  if (family=="Tweedie"){
    m <- fit_double_tweedie(model = model, data = dat, dmodel = dmodel)
    a <- anova.dglm.basic(m, tweedie.power = m$p)
  } else {
    m <- stats::glm(model, dat, family = family)
  }
  
  s <- summary(m)$coefficients %>% 
    tibble::as_tibble(rownames = 'term')
  coef <- s$term
  
  s <- s[!s$term %in% c('(Intercept)', 'istip', 'core', 'depth', 'istip:core'), , drop=FALSE]
  indices <- which(coef %in% s$term)
  s$term <- gsub("TRUE", "", s$term)
  colnames(s) <- c('term','estimate','std.error','statistic','p.value')
  
  if ((family=='Tweedie')  & modeldisp){
    s <- s %>% tibble::add_row(
      term=paste('dispersion model:', keep),
      estimate=NA,
      std.error=NA,
      statistic=a$Chisq,
      p.value=a$p.value)
  }
  
  # run bootstrap
  if (nboot>1){
    if (family=='Tweedie'){
      boot_reps <- boot::boot(dat, fit_double_model,
                              strata = factor(dat$pangenome),
                              R = nboot,
                              stype='i',
                              model=model, dmodel=dmodel)
    } else {
      boot_reps <- boot::boot(dat, fit_model,
                              strata = factor(dat$pangenome),
                              R = nboot,
                              stype='i',
                              model=model, family=family, boot_type='branch')
    }
    
    ci <- purrr::map_dfr(indices, ~{
      df <- tibble::as_tibble(t(boot_ci_pval(boot_reps, index=.x, type=ci_type,
                                             theta_null=0, ci_conf=conf,
                                             transformation='identity')))
      df$term <- gsub("TRUE", "", names(m$coefficients)[[.x]])
      return(df)
    })
    
    s$`bootstrap CI 2.5%` <- signif(ci$V1, 5)[match(s$term, ci$term)]
    s$`bootstrap CI 97.5%` <- signif(ci$V2, 5)[match(s$term, ci$term)]
  } else {
    s$`bootstrap CI 2.5%` <- NA
    s$`bootstrap CI 97.5%` <- NA
  }
  
  return(list(
    summary=s,
    model=m
  ))
  
}