panstripe: Pangenome Plots

Documented in compare_pangenomes

#' compare_pangenomes
#' 
#'
#' @description Compares the coefficients between two pangenomes datasets by including interaction terms in the GLM.
#'
#' @param fitA a `panfit` object generated by the `panstripe` function
#' @param fitB a `panfit` object generated by the `panstripe` function
#' @param family the family used by glm. One of 'Tweedie', 'Poisson', 'Gamma' or 'Gaussian'. (default='Tweedie')
#' @param intercept whether or not to include an intercept term in the GLM (default=FALSE). Adding an intercept can increase the robustness of the algorithm to errors at higher branches of the phylogeny at the expense of less sensitivity. 
#' @param modeldisp whether or not to model the dispersion as a function of the covariates of interest if using a Tweedie family (default=TRUE)
#' @param ci_type the method used to calculate the bootstrap CI (default='perc'). See \link[boot]{boot.ci} for more details.
#' @param conf A scalar indicating the confidence level of the required intervals (default=0.95)
#' @param nboot the number of bootstrap replicates to perform (default=100)
#' @param boot_pvalue whether or not to calculate bootstrap p-values (default=FALSE)
#'
#' @return a list containing a summary of the comparison and the resulting `glm` model object
#'
#' @examples
#'
#' simA <- simulate_pan(rate=1e-4, ngenomes = 200, mean_trans_size=3, fp_error_rate=1)
#' simB <- simulate_pan(rate=1e-3, ngenomes = 200, mean_trans_size=3, fp_error_rate=1)
#' fitA <- panstripe(simA$pa, simA$tree, nboot=0)
#' fitA$summary
#' fitB <- panstripe(simB$pa, simB$tree, nboot=0)
#' fitB$summary
#' comp <- compare_pangenomes(fitA, fitB, nboot=10)
#' comp$summary
#'
#' @export
compare_pangenomes <- function(fitA, fitB, family="Tweedie", intercept=FALSE, modeldisp=TRUE, 
                               ci_type='perc', conf=0.95, nboot=100, boot_pvalue=FALSE){
  
  # input checks
  if (class(fitA) != 'panfit') stop('fitA is not of class `panfit`!')
  validate_panfit(fitA)
  if (class(fitB) != 'panfit') stop('fitB is not of class `panfit`!')
  validate_panfit(fitB)
  
  #combine data
  dat <- purrr::imap_dfr(list(fitA, fitB), ~{
    tibble::add_column(.x$data, pangenome=.y-1, .before=1)
  })
  
  # check for all 0's
  if ((sum(fitA$data$acc[fitA$data$istip==0])==0) | (sum(fitA$data$acc[fitA$data$istip==1])==0) |
    (sum(fitB$data$acc[fitB$data$istip==0])==0) | (sum(fitB$data$acc[fitB$data$istip==1])==0)) {
    warning("No gene gains/losses identified at all at internal branches or tips in one or both pangenomes!")
  }
  
  # set model
  if (intercept){
    model <- stats::as.formula("acc ~ 0 + pangenome + istip + core + depth + istip:core + depth:pangenome + istip:pangenome + core:pangenome + istip:core:pangenome")
  } else {
    model <- stats::as.formula("acc ~ 0 + istip + core + depth + istip:core + depth:pangenome + istip:pangenome + core:pangenome + istip:core:pangenome")
  }
  
  # set dispersion model
  if (modeldisp){
    dmodel <- stats::as.formula("acc ~ pangenome")
  } else {
    dmodel <- stats::as.formula("acc ~ 1")
  }
  
  # fit model
  if (family=='Tweedie'){
    m <- fit_double_tweedie(model, dmodel=dmodel, data = dat)
    a <- anova.dglm.basic(m, tweedie.power = m$p)
  } else {
    m <- stats::glm(model, dat, family = family)
  }
  
  s <- summary(m)$coefficients %>% 
    tibble::as_tibble(rownames = 'term')
  s <- s[s$term %in% c('istip:pangenome', 'core:pangenome', 'depth:pangenome',
                       'pangenome:istip', 'pangenome:core', 'pangenome:depth'), , drop=FALSE]
  s$term <- gsub("pangenome:", "", s$term)
  s$term <- gsub(":pangenome", "", s$term)
  colnames(s) <- c('term','estimate','std.error','statistic','p.value')
  
  if ((family=='Tweedie') & modeldisp){
    s <- s %>% tibble::add_row(
      term='dispersion model',
      estimate=NA,
      std.error=NA,
      statistic=a$Chisq,
      p.value=a$p.value)
  }
  
  # run bootstrap
  if (nboot>1){
    if (family=='Tweedie'){
      boot_reps <- boot::boot(dat, fit_double_model,
                              R = nboot,
                              stype='i',
                              strata=factor(dat$pangenome),
                              model=model, dmodel=dmodel)
    } else {
      boot_reps <- boot::boot(dat, fit_model,
                              R = nboot,
                              stype='i',
                              strata=factor(dat$pangenome),
                              model=model, family=family, boot_type='branch', fit_method='glm')
    }
    
    ci <- purrr::map_dfr(which(grepl('^[a-z]+:pangenome$', names(m$coefficients)) |
                                 grepl('^pangenome:[a-z]+', names(m$coefficients))), ~{
                                   df <- as.data.frame(t(boot_ci_pval(boot_reps, index=.x, type=ci_type,
                                                                                  theta_null=0, ci_conf=conf,
                                                                                  transformation='identity', calc_pval = boot_pvalue)))
                                   df$term <- gsub("pangenome:|:pangenome", "", names(m$coefficients)[[.x]])
                                   return(df)
                                 })
    
    s$`bootstrap CI 2.5%` <- signif(as.numeric(ci$V1[match(s$term, ci$term)]), 5)
    s$`bootstrap CI 97.5%` <- signif(as.numeric(ci$V2[match(s$term, ci$term)]), 5)
    
    if (boot_pvalue){
      s$`bootstrap p-value` <- signif(as.numeric(ci$V3[match(s$term, ci$term)]), 5)
    }
    
  } else {
    boot_reps <- NULL
    s$`bootstrap CI 2.5%` <- NA
    s$`bootstrap CI 97.5%` <- NA
  }
  
  return(list(
    summary=s,
    model=m,
    data=dat
  ))
}


double_tweedie_llk <- function(p, model, dmodel, data){
  dm <- dglm_mod(model, 
                 dformula = dmodel, 
                 data = data, 
                 family = statmod::tweedie(var.power = p, link.power = 0), 
                 tweedie.var.power=p)
  return(dm$m2loglik)
}

fit_double_tweedie <- function(model, dmodel, data){
  fm <- tryCatch(
    {
      op <- stats::optimise(double_tweedie_llk, lower = 1.01, upper = 1.99, model=model, dmodel=dmodel, data=data)
      tm <- dglm_mod(formula = model,  
                     dformula =  dmodel,
                     data = data,
                     family = statmod::tweedie(var.power = op$minimum, link.power = 0), 
                     tweedie.var.power=op$minimum)
      # check convergence
      ftm <- stats::glm(model, data, family = statmod::tweedie(var.power = op$minimum, link.power = 0))
      tm$p <- op$minimum
      tm$converged <- ftm$converged
      tm
    },
    error=function(cond) {
      stop(
        "Panstripe model fit failed! This can sometime be caused by unusual branch lengths.
Setting family='quasipoisson' or 'gaussian' often provides a more stable fit to difficult datasets"
      )
    }
  )
  if (!fm$converged) {
    warning(
      "Panstripe model fit failed to converge!
Setting family='gaussian' often provides a more stable fit to difficult datasets"
    )
  }
  return(fm)
}

fit_double_model <- function(d, indices=NULL, model, dmodel){
  stopifnot(length(indices)==nrow(d))
  
  tdat <- d[indices,]
  tm <- fit_double_tweedie(model, dmodel = dmodel, data = tdat)
  coef <- c(tm$coefficients, tm$dispersion.fit$coefficients)
  
  return(coef)
}