R/taxatree_models2stats.R

Defines functions tidy_bbdml taxModel2stats tax_models2stats taxatree_models2stats

Documented in taxatree_models2stats tax_models2stats

#' @title
#' Extract statistics from taxatree_models or tax_model output
#'
#' @description
#' Runs a function e.g. `broom::tidy` on a list of models, i.e. the output of
#' `taxatree_models` or `tax_model`
#'
#' @param data
#' psExtra with attached tax_models or taxatree_models list, or just the list of models
#' @param fun function to assist extraction of stats dataframe from models, or "auto"
#' @param ... extra arguments passed to fun
#' @param .keep_models should the models list be kept in the psExtra output?
#'
#' @return data.frame, attached to psExtra
#' @export
#' @describeIn
#' models2stats
#' Extract stats from list or psExtra output of taxatree_models
#' @examples
#' # This example is an abbreviated excerpt from article on taxon modelling on
#' # the microViz documentation website
#'
#' library(dplyr)
#' data("ibd", package = "microViz")
#'
#' # We'll keep only the Ulcerative Colitis and Healthy Control samples, to
#' # simplify the analyses for this example. We'll also remove the Species
#' # rank information, as most OTUs in this dataset are not assigned to a
#' # species. We'll also use `tax_fix` to fill any gaps where the Genus is
#' # unknown, with the family name or whatever higher rank classification is
#' # known.
#'
#' phylo <- ibd %>%
#'   ps_filter(DiseaseState %in% c("UC", "nonIBD")) %>%
#'   tax_mutate(Species = NULL) %>%
#'   tax_fix()
#'
#' # Let's make some sample data variables that are easier to use and compare
#' # in the statistical modelling ahead. We will convert dichotomous
#' # categorical variables into similar binary variables (values: 1 for true,
#' # or 0 for false). We will also scale and center the numeric variable for
#' # age.
#'
#' phylo <- phylo %>%
#'   ps_mutate(
#'     UC = ifelse(DiseaseState == "UC", yes = 1, no = 0),
#'     female = ifelse(gender == "female", yes = 1, no = 0),
#'     antibiotics = ifelse(abx == "abx", yes = 1, no = 0),
#'     steroids = ifelse(steroids == "steroids", yes = 1, no = 0),
#'     age_scaled = scale(age, center = TRUE, scale = TRUE)
#'   )
#'
#' lm_models <- phylo %>%
#'   tax_fix() %>%
#'   tax_prepend_ranks() %>%
#'   tax_transform("compositional", rank = "Genus") %>%
#'   tax_filter(min_prevalence = 0.1, use_counts = TRUE) %>%
#'   taxatree_models(
#'     type = lm,
#'     trans = "log2", trans_args = list(zero_replace = "halfmin"),
#'     ranks = c("Phylum", "Class", "Genus"),
#'     variables = c("UC", "female", "antibiotics", "steroids", "age_scaled")
#'   )
#'
#' lm_stats <- lm_models %>% taxatree_models2stats()
#'
#' # inspect the psExtra returned, now with taxatree_stats dataframe added
#' lm_stats
#'
#' # inspect the dataframe itself
#' lm_stats %>% taxatree_stats_get()
#'
#' # keep the models on the psExtra object
#' lm_models %>% taxatree_models2stats(.keep_models = TRUE)
#'
#' # you can adjust the p values with taxatree_stats_p_adjust
#'
#' # you can plot the results with taxatree_plots
taxatree_models2stats <- function(data,
                                  fun = "auto",
                                  ...,
                                  .keep_models = FALSE) {
  if (!is.list(data)) check_is_psExtra(x = data, argName = "data")
  if (is(data, "psExtra")) models <- data@taxatree_models
  if (!is.list(models)) {
    stop("data arg psExtra must have 'taxatree_models' list attached")
  }
  if (inherits(data, "list")) {
    models <- data
  } else {
    stopifnot(all(names(models) %in% phyloseq::rank_names(ps_get(data))))
  }

  # get stats
  stats <- lapply(X = names(models), FUN = function(rank) {
    tax_models2stats(data = models[[rank]], rank = rank, fun = fun, ...)
  })
  stats <- purrr::reduce(stats, rbind.data.frame)
  stats <- dplyr::mutate(
    .data = stats,
    dplyr::across(dplyr::all_of(c("term", "rank")), ~ factor(.x, unique(.x)))
  )

  # return psExtra (based on input data class)
  if (is(data, "psExtra")) {
    data@taxatree_stats <- stats
    if (isFALSE(.keep_models)) data@taxatree_models <- NULL
    return(data)
  }
  # otherwise return data frame
  return(stats)
}

#' @param rank
#' string naming rank at which tax_model was run (needed if data is a list)
#'
#' @export
#' @describeIn
#' models2stats
#' Extract stats from list or psExtra output of tax_model
tax_models2stats <- function(data,
                             rank = NULL,
                             fun = "auto",
                             ...,
                             .keep_models = FALSE) {
  if (inherits(data, "list")) {
    if (is.null(rank)) stop("If `data` is just a list, `rank` must not be NULL")
    models <- data
  } else {
    check_is_psExtra(data, "data")
    models <- tax_models_get(data)
    if (!is.list(models)) {
      stop("psExtra must have tax_models list attached, did you use tax_model()?")
    }
    rank <- names(models)
    if (!rlang::is_string(rank, string = phyloseq::rank_names(ps_get(data)))) {
      stop("tax_models list must be length 1 & have name of a rank in psExtra")
    }
    models <- models[[rank]] # remove one layer of nesting
  }

  # if univariable mode was on then this layer is a list e.g. v1:tax1;tax2;...
  if (identical(class(models[[1]]), "list")) models <- purrr::flatten(models)

  # map2 to avoid indexing by names, which are duplicated in univariable mode
  stat_list <- purrr::map2(
    .x = seq_along(models), .y = names(models), .f = function(x, y) {
      taxModel2stats(models[[x]], taxon = y, rank = rank, fun = fun, ...)
    }
  )
  stats <- purrr::reduce(stat_list, rbind.data.frame)

  # return psExtra or data.frame (based on input data class)
  if (is(data, "psExtra")) {
    data@tax_stats <- stats
    if (isFALSE(.keep_models)) data@tax_models <- NULL
    return(data)
  }
  return(stats)
}


# fun: a function (not a string), to compute stats from model, or "auto"
#
taxModel2stats <- function(model,
                           taxon,
                           rank,
                           fun = "auto",
                           dropTerms = "(Intercept)",
                           ...) {
  # check `fun` argument
  if (!inherits(fun, "function") && !identical(fun, "auto")) {
    stop(
      'fun must be a function or "auto", it is class: ',
      paste(class(fun), collapse = " / ")
    )
  }

  # infer tidying function from model type if "auto" fun
  if (identical(fun, "auto")) {
    # for most models, assume broom::tidy has an appropriate method
    fun <- broom::tidy
    # special tidying for corncob bbdml models
    if (inherits(model, "bbdml")) fun <- tidy_bbdml
  }
  # run tidying function, with any extra arguments
  df <- fun(model, ...)
  # check result is actually a dataframe
  if (!inherits(df, "data.frame")) {
    stop(
      "`fun` function must return a data.frame or tibble.\n",
      "It returned an object of class: ", paste(class(df), collapse = " / ")
    )
  }

  # add taxon name and clean up dataframe
  df[["formula"]] <- attr(x = model, which = "formula_string", exact = TRUE)
  if (is.null(df[["term"]])) {
    # in univariable mode with cor.test etc, the term is not added by broom tidy
    # but we can retrieve it from the formula again
    df[["term"]] <- all.vars(stats::as.formula(df$formula[1]), unique = FALSE)[2]
  }
  df[["taxon"]] <- taxon
  df[["rank"]] <- rank
  df <- dplyr::relocate(df, dplyr::any_of(c("term", "taxon", "rank", "formula")))
  df <- dplyr::filter(df, !.data[["term"]] %in% dropTerms)
  return(tibble::as_tibble(df))
}


# gets stats from corncob::bbdml model
tidy_bbdml <- function(model,
                       param = "mu",
                       method = corncob::waldt,
                       ...) {
  # get stats from model
  stats <- method(model, ...)
  df <- as.data.frame(stats, optional = TRUE, stringsAsFactors = FALSE)
  df <- tibble::rownames_to_column(df, var = "rownames")
  # standardise names in accordance with vibe of broom tidy guidelines
  df <- dplyr::rename(
    .data = df,
    p.value = "Pr(>|t|)", t.statistic = "t value",
    std.error = "Std. Error", estimate = "Estimate"
  )
  # bbdml models have mu (abundance) and phi (dispersion) estimates
  df <- tidyr::separate(
    data = df,
    col = "rownames", into = c("parameter", "term"),
    extra = "merge", sep = "[.]", remove = TRUE
  )
  # keep only desired statistical parameter ("mu", "phi" or both?)
  df <- dplyr::filter(df, .data$parameter %in% param)
  return(df)
}
david-barnett/microViz documentation built on April 17, 2025, 4:25 a.m.