R/nb_mods.R

Defines functions nb_mods

Documented in nb_mods

#' @title Fit negative binomial models to each taxa within an OTU table
#' @name nb_mods
#' @description Fit negative binomial models to each taxa within an OTU table through \code{\link[MASS]{glm.nb}} in the \pkg{MASS} package. Models can include a random effect if desired. Modesl will then be fit through \code{\link[lmer]{glmer.nb}} in the lmer package. Summaries for models or confidence intervals that fail to converge will not be returned, but taxa summaries will be provided in the output. Rank-Sum tests or presence/absence tests can be run on these taxa using \code{tidi_rank_sum} or \code{tidi_chisq}, respectively
#' @param micro_set A tidy_micro data set
#' @param table OTU table of interest
#' @param ... Covariates of interest. Can be interactions such as Group*Age
#' @param RE A character string specifying the random effect term in the syntax of a lmer model; i.e. a random intercept for subject would be written as RE = "(1|Subject)"
#' @param Offset Logical; include subject sequencing depth as an offset for negative binomial models. This is highly recommended
#' @param ref A character vector of the desired reference levels for each factor covariate. The order of the specifed references must match the order for the corresponding covariates specified in '...'
#' @param SS_type Type of sums of squares calculated in \code{\link[car]{Anova}}. Either type II (2) or type III (3) sums of squares
#' @references \code{\link[car]{Anova}}, \code{\link[MASS]{glm.nb}}, \code{\link[lmer]{glmer.nb}}
#' @details Models containing only fixed effects are fit using \code{\link[MASS]{glm.nb}} in the \pkg{MASS} package and models containing random effects are fit using \code{\link[lmer]{glmer.nb}}. ANOVA / ANCOVA tests are conducted using a Likelihood Ratio test for fixed effects models and Chi-Squared tests for random effect models.
#' @return A list containing several different model components and summaries
#' \item{Convergend_Summary}{A data.frame of model summaries from convergent models. Includes the Taxa name, the model coefficient, the estimated beta, the beta's 95 percent confidence interval, Z score, p_value, false discovery rate p-value}
#' \item{Estimate_Summary}{A data.frame of model estimates from convergent models intended to be ready for export for publications. Includes the Taxa name, the model coefficient, the estimated Rate Ratio, the Wald 95 percent confidence interval, the Z-score, and false discovery rate p-value}
#' \item{RA_Summary}{A data.frame of taxa summaries. Includes the Taxa name, grouping variables (each factor variable in your models), sample size (n), percent of 0 counts, basic summaries of relative abundance, percentiles of relative abundance, and a logical indicator of whether or not the model converged}
#' \item{formula}{The formula used in the model}
#' \item{Model_Coef}{Model coefficients (used in plotting funcitons)}
#' \item{Model_Covs}{Model covariates (used in plotting functions)}
#' @note False Discovery Rate p-values are calculated using \code{\link[stats]{p.adjust}}. Estimated rate ratios and confidence intervals for interactions in the Estimate_Summary table include all main effects. It is not simply the exponentiated interaction beta, it is the interaction of the sum of the intercept, corresponding main effect betas, and interaction betas
#' @examples
#' data(phy); data(cla); data(ord); data(fam); data(met)
#'
#' otu_tabs <- list(Phylum = phy, Class = cla, Order = ord, Family = fam)
#' set <- tidy_micro(otu_tabs = otu_tabs, meta = met)
#'
#' ## Fixed effects only
#' nb_fam <- set %>%
#' filter(day == 7) %>% ## Only including first week
#' mutate(bpd1 = factor(bpd1)) %>%
#'
#' ## Filtering out low abundance and unclassified taxa
#' ## These models will either break or we don't care about them
#' otu_filter(ra_cutoff = 0.1, exclude_taxa = c("Unclassified", "Bacteria")) %>%
#'
#' ## Negative binomial models for each Family of taxa with bpd1 as a covariate
#' nb_mods(table = "Family", bpd1)
#'
#' names(nb_fam)
#' nb_fam$Estimate_Summary
#'
#'
#' ## Random Intercept for Subject
#' nb_re <- set %>%
#' mutate(bpd1 = factor(bpd1)) %>%
#'
#' ## Filtering out low abundance and unclassified taxa
#' ## These models will either break or we don't care about them
#' otu_filter(ra_cutoff = 0.1, exclude_taxa = c("Unclassified", "Bacteria")) %>%
#'
#' ## Negative binomial models for each Family of taxa with bpd1 as a covariate
#' nb_mods(table = "Family", bpd1, RE = "(1|day)")
#'
#' names(nb_re)
#' nb_re$Estimate_Summary
#' @export
nb_mods <- function(micro_set, table, ..., RE = NULL, Offset=TRUE, ref=NULL, SS_type){

  if(table %nin% unique(micro_set$Table)) stop("Specified table is not in supplied micro_set")

  micro_set %<>% dplyr::filter(Table == table) %>% ## Filtering out by table
    dplyr::mutate(Taxa = factor(Taxa, levels = unique(Taxa)))
  ## Making Taxa levels the Taxa names w/in table for tapply

  ## Defining formula for glm.nb with  w/ or w/o and Offset. Offset is the default

  f <- formula_fun(...)
  if(Offset) f <- paste(f, "+ offset(log(Total))")

  ## relevels the factor variables
  if(!is.null(ref)){
    if(!is.character(ref)) stop("ref must be a character string of valid reference levels")
    Cov <- cov_str(...) %>%
      dplyr::select(micro_set,.)              ## pulling out model covariates

    Fac <- Cov[sapply(Cov,class) == "factor"]   ## selecting factor variables

    ## for loops are bad, but this should happen quickly (no need to apply)
    ## relevels each factor
    for(i in 1:length(ref)){
     Fac[i] <- relevel(Fac[,i], ref = ref[i])
    }

    Cov <- data.frame(Fac, Cov[sapply(Cov,class) != "factor"])

    micro_set <- data.frame(
      micro_set[which( !(names(micro_set) %in% names(Cov)) )],
      Cov)
  } else {
    Cov <- cov_str(...) %>%
      dplyr::select(micro_set,.)              ## pulling out model covariates
  }

  if(missing(SS_type)) SS_type <- 2
  if(SS_type != 2 & SS_type != 3 ) stop('SS_type must be either 2 or 3')

  ## Fixed effects models
  if(is.null(RE)){

    Convergent_Models <- micro_set %>%
      FE.con(f, SS_type, !!!rlang::quos(...)) ## Gives summaries of convergent models

    ## Gives Taxa summaries
    RA_Summary <- micro_set %>% N.con(ra,!!!rlang::quos(...)) %>%
      left_join(Convergent_Models %>%
                  dplyr::select(Taxa, FE_Converged), by = "Taxa") %>%
      dplyr::arrange(FE_Converged, Taxa) %>% unique

    ## TRUE / FALSE for taxa converging
    con_mod <- RA_Summary %>%
      dplyr::ungroup() %>%
      dplyr::distinct(Taxa, .keep_all = T) %>%
      dplyr::pull(FE_Converged)

    cat("\n", sum(con_mod), " taxa converged\n", sep = "")
    cat(sum(!con_mod), "taxa did not converge\n")

    result <- list(Convergent_Summary=Convergent_Models %>%  ## Convergent model summaries
           dplyr::filter(FE_Converged) %>%
           dplyr::select(Taxa, Coef, Beta, CI, Z, P_val, FDR_Pval, Anova),
         Estimate_Summary=Convergent_Models %>%
           dplyr::filter(FE_Converged) %>%
           dplyr::select(Taxa, Coef, RR, CI_95, Z, FDR_Pval) %>%
           dplyr::filter(Coef != "(Intercept)"),
         RA_Summary=RA_Summary,  ## Nonconvergent summaries
         formula=f,  ## The formula used so you can check it is what you intended
         Model_Coef = Convergent_Models %>%
           dplyr::filter(FE_Converged) %>%
           dplyr::select(Taxa, Coef, Intercept, Estimate = Beta, Cov_Type),
         Model_Covs = Cov
    )
  } else if(!is.null(RE)){ ## Random Effect

    ## Adding randome effect
    f <- paste(f,"+", RE)

    Convergent_Models <- micro_set %>%
      RE.con(f, SS_type, !!!rlang::quos(...), RE = RE) ## Gives summaries of convergent models

    ## Gives Taxa summaries
    RA_Summary <- micro_set %>% N.con(ra,!!!rlang::quos(...)) %>%
      left_join(Convergent_Models %>%
                  dplyr::select(Taxa, RE_Converged), by = "Taxa") %>%
      arrange(RE_Converged, Taxa) %>% unique

    ## TRUE / FALSE for taxa converging
    con_mod <- RA_Summary %>%
      dplyr::ungroup() %>%
      dplyr::distinct(Taxa, .keep_all = T) %>%
      dplyr::pull(RE_Converged)

    cat("\n", sum(con_mod), " taxa converged\n", sep = "")
    cat(sum(!con_mod), "taxa did not converge\n")

    result <- list(Convergent_Summary = Convergent_Models %>%  ## Convergent model summaries
           dplyr::filter(RE_Converged) %>%
           dplyr::select(Taxa, Coef, Beta, CI, Z, P_val, FDR_Pval, Anova),
         Estimate_Summary = Convergent_Models %>%
           dplyr::filter(RE_Converged) %>%
           dplyr::select(Taxa, Coef, RR, CI_95, Z, FDR_Pval),
         RA_Summary = RA_Summary,  ## Nonconvergent summaries
         formula = f,  ## The formula used so you can check it is what you intended
         Model_Coef = Convergent_Models %>%
           dplyr::select(Taxa, Coef, Intercept, Estimate = Beta, Cov_Type),
         Model_Covs = Cov
    )
  }
  result
}


## Function for summarizing convergent models
FE.con <- function(micro_set, f, SS_type, ...){
  ## Defining formula for glm.nb with  w/ or w/o and Offset. Offset is the default

  ## Summary Stats of convergent models
  conv <- plyr::ldply(unique(micro_set$Taxa), function(x){
    m <- try(MASS::glm.nb(as.formula(f), data = micro_set %>%  ## The model
                      dplyr::filter(Taxa == as.character(x))), silent = T)
    if(!inherits(m, "try-error")){
    ## Stops if confint doesn't work. Some models converge but confint breaks
    CI <- try(stats::confint(m), silent = T)
    AA <- try(car::Anova(m), silent = T)
    if(!inherits(CI, "try-error") & !any(is.na(CI)) & !inherits(AA, "try-error")){
    data.frame(
      Taxa = as.character(x),                        ## Taxa Names
      Coef = names(m$coefficients),                  ## Coefficient Names
      RR = coef_est(m, ..., RE = NULL) %>%
                    .$Est %>% exp %>% round(4),      ## Risk Ratios
      CI_95 = ci_est(m, ..., RE = NULL),             ## Wald intervals
      Z = coef(summary(m))[,3] %>% round(4),         ## Z statistic
      P_val = coef(summary(m))[,4],                  ## P_value (not rounded for FDR calculation)
      Intercept = coef(summary(m))[1,1],             ## Intercepts for calculations
      Beta = m$coefficients,                         ## Raw Coefs for later use
      Cov_Type = cov_type(names(m$coefficients),
                          ..., RE = NULL),  ## Covariate types for Bar Charts (no random effects)
      ## CI for individual Beta's (profile likelihood)
      CI = paste0("(", round(CI[,1],4), ", ", round(CI[,2],4), ")"),
      FE_Converged = TRUE
    ) %>% Anova_SS(m, SS_type, test = "LR") ## Anova for each covariate
    } else{
      data.frame(
        Taxa = as.character(x),
        Coef = character(1), RR = NA, CI_95 = character(1), Z = NA, P_val = NA, Intercept = NA,
        Beta = NA, Cov_Type = character(1), CI = character(1), FE_Converged = FALSE, Anova = NA)
      }
    } else {
      data.frame(
        Taxa = as.character(x),
        Coef = character(1), RR = NA, CI_95 = character(1), Z = NA, P_val = NA, Intercept = NA,
        Beta = NA, Cov_Type = character(1), CI = character(1), FE_Converged = FALSE, Anova = NA)
        }
    })

  if(nrow(conv) == 0) stop("No taxa models converged.")

  ## False Discovery Rate adjustment
  conv %>% dplyr::mutate(FDR_Pval = stats::p.adjust(P_val, method = "BH") %>% round(4))
}

RE.con <- function(micro_set, f, SS_type, ..., RE){

  ## Summary Stats of convergent models
  conv <- plyr::ldply(unique(micro_set$Taxa), function(x){
    m <- try(lme4::glmer.nb(as.formula(f), data = micro_set %>%  ## The model
                    dplyr::filter(Taxa == as.character(x))), silent = T)

    if(!inherits(m, "try-error")){
    CI <- try(stats::confint(m), silent = T)
    AA <- try(car::Anova(m), silent = T)
    if(!inherits(CI, "try-error") & !any(is.na(CI)) & !inherits(AA, "try-error")){
      data.frame(
        Taxa = as.character(x),                             ## Taxa Names
        Coef = coef(summary(m)) %>% rownames,               ## Coefficient Names
        RR = coef_est(m, !!!rlang::quos(...), RE = RE) %>%
          .$Est %>% exp %>% round(4),    ## Risk Ratios
        CI_95 = ci_est(m, !!!rlang::quos(...), RE = RE),           ## Upper profile likelihood conidence limits
        Z = coef(summary(m))[,3] %>% round(4),              ## Z statistic
        P_val = coef(summary(m))[,4],                       ## P_value (not rounded for FDR calculation)
        Intercept = m@beta[1],                  ## Intercepts for calculations
        Beta = m@beta,                    ## Raw Coefs for later use
        Cov_Type = cov_type(names(coef(summary(m))[,1]),
                            !!!rlang::quos(...), RE = RE),  ## Covariate types for Bar Charts
        ## CI for betas
        CI = paste("(", CI[-1,1] %>% round(4),", ",
                   CI[-1,2] %>% round(4), ")", sep = ""),
        RE_Converged = TRUE
      ) %>% Anova_SS(m, SS_type, test = "Chisq" ) ## Anova for each covariate
    } else{
      data.frame(
        Taxa = as.character(x),
        Coef = character(1), RR = NA, CI_95 = character(1), Z = NA, P_val = NA, Intercept = NA,
        Beta = NA, Cov_Type = character(1), CI = character(1), FE_Converged = FALSE, Anova = NA)
    }} else{
      data.frame(
        Taxa = as.character(x),
        Coef = character(1), RR = NA, CI_95 = character(1), Z = NA, P_val = NA, Intercept = NA,
        Beta = NA, Cov_Type = character(1), CI = character(1), FE_Converged = FALSE, Anova = NA)
    }
  })

  if(nrow(conv) == 0) stop("No taxa models converged.")

  ## False Discovery Rate adjustment
  conv %>% dplyr::mutate(FDR_Pval = stats::p.adjust(P_val, method = "BH") %>% round(4))
}

## Add in summary measures of diversity measures?
  ## It could just be the "obj"
## Function for summarizing Taxa for all Taxa or nonconvergent taxa from NB_summary
N.con <- function(x, obj = ra, ...){
  obj <- rlang::enquo(obj)
  if(missing(...)){ ## Don't always need ... (the covariates)
    x %>%
      taxa_summarize(!!obj) %>%
      return()
  }
  else{
    grp <- cov_str(...)
    x %>% dplyr::select(Taxa,!!obj, grp) %>%
      dplyr::group_by_if(is.factor) %>%  ## Groups by covariates when given
      taxa_summarize(!!obj) %>%
      return()
  }
}



## Helper functions
# MK: I moved this source to the set-up code in tidy_MIBI
# source('~/Documents/Research/Current/Jed_Stuff/03_tidy_microbiome/Code/Summary_Helper.R')

# ## Fixed Effects Negative Binomial Models
# FE_mod <- function(d,f){
#
#   ## Models might converge, but have problematic fitting issues.
#   options(warn = 2) ## Changes warnings to errors for the try() statements
#
#   ## Pulling out convergent Taxa
#   conv <- unique(d$Taxa)[ !sapply(unique(d$Taxa), function(x,f){
#     try(glm.nb(as.formula(f), data = d %>%
#                  filter(Taxa == as.character(x)), maxit = 5000), silent = T) %>%
#       inherits("try-error")
#   }, f=f) ]
#
#   dc <- d %>% filter(Taxa %in% conv)
#
#   ## Repeating the process just to make sure the models converge
#   Converged <- unique(dc$Taxa)[ !sapply(unique(dc$Taxa), function(x,f){
#     try(glm.nb(as.formula(f), data = dc %>%
#                  filter(Taxa==as.character(x)), maxit=5000), silent = T) %>%
#       inherits("try-error")
#   }, f=f) ]
#
#   options(warn = 0) ## Making warnings warnings again
#
#   d %>% mutate(FE_Converged = Taxa %in% Converged) %>%
#     return()  ## Returns data filtered by table with new column. Mostly for piping into NB_summary
# }
#
# ## Random Effects Negative Binomial Models
# RE_mod <- function(d,f){
#   ## Pulling out convergent Taxa
#   conv <- unique(d$Taxa)[ !sapply(unique(d$Taxa), function(x,f){
#     try(glmer.nb(as.formula(f), data = d %>%
#                    filter(Taxa==as.character(x))),silent = T) %>%
#       inherits("try-error")
#   }, f=f) ]
#
#   dc <- d %>% filter(Taxa %in% conv)
#
#   ## Repeating the process just to make sure the models converge
#   Converged <- unique(dc$Taxa)[ !sapply(unique(dc$Taxa), function(x,f){
#     try(glmer.nb(as.formula(f), data = dc %>%
#                    filter(Taxa==as.character(x))),silent = T) %>%
#       inherits("try-error")
#   }, f=f) ]
#
#   d %>% mutate(RE_Converged = Taxa %in% Converged) %>%
#     return()  ## Returns data filtered by table with new column. Mostly for piping into NB_summary
# }
CharlieCarpenter/tidy.micro documentation built on Jan. 19, 2020, 6:28 p.m.