R/estimate.R

Defines functions residuals.mc_bias_fit fitted.mc_bias_fit estimate_bias.phyloseq estimate_bias.otu_table estimate_bias.matrix estimate_bias

Documented in estimate_bias estimate_bias.matrix estimate_bias.otu_table estimate_bias.phyloseq

#' Estimate bias from control measurements
#'
#' Estimate bias using the compositional least-squares approach described in
#' McLaren, Willis, and Callahan (2019).
#'
#' Bias is estimated by applying [center()] to the compositional error matrix
#' defined by `observed/actual`, which requires that `observed` and `actual`
#' are non-zero for the same sample-taxa pairs. For convenience, this
#' function will automatically set values in `observed` to 0 whose
#' corresponding entries are 0 in `actual`, but it is up to you to replace 0
#' values in `observed` with a non-zero value (such as a pseudocount).
#'
#' Requirements for `observed` and `actual`: The row and column names (for
#' matrices) or taxa and sample names (for phyloseq objects) must match, but
#' can be in different orders. Any taxa and samples in `observed` but not in
#' `actual` will be dropped prior to estimation.
#'
#' @param observed Abundance matrix of observed compositions.
#' @param actual Abundance matrix of actual or reference compositions for the
#'   same samples and taxa in `observed`.
#' @param margin Matrix margin that corresponds to observations (samples); 
#'   `1` for rows, `2` for columns.
#' @param ... Arguments passed to the matrix method.
#' @param boot Whether to perform bootstrapping.
#' @param times Number of bootstrap replicates.
#'
#' @return A `mc_bias_fit` object with [coef()], [fitted()], [residuals()], and
#'   [summary()] methods.
#' 
#' @seealso [center()] [calibrate()] 
#' 
#' @rdname estimate_bias
#' @export
#'
#' @examples
#' # Load data from the cellular mock communities of Brooks et al 2015
#' dr <- system.file("extdata", package = "metacal")
#' list.files(dr)
#' actual <- file.path(dr, "brooks2015-actual.csv") |>
#'   read.csv(row.names = "Sample") |>
#'   as("matrix")
#' observed <- file.path(dr, "brooks2015-observed.csv") |>
#'   read.csv(row.names = "Sample") |>
#'   subset(select = - Other) |>
#'   as("matrix")
#' sam <- file.path(dr, "brooks2015-sample-data.csv") |> read.csv()
#'
#' # Estimate bias with bootstrapping for error estimation
#' mc_fit <- estimate_bias(observed, actual, margin = 1, boot = TRUE)
#' summary(mc_fit)
estimate_bias <- function(observed, actual, ...) {
  UseMethod("estimate_bias")
}

#' @rdname estimate_bias
#' @method estimate_bias matrix
#' @export
estimate_bias.matrix <- function(observed, 
                                 actual, 
                                 margin, 
                                 boot = FALSE, 
                                 times = 1000) {
  stopifnot(margin %in% 1:2)
  stopifnot(all(rownames(actual) %in% rownames(observed)))
  stopifnot(all(colnames(actual) %in% colnames(observed)))

  # Standardize on samples as rows, then align samples and taxa between the two
  # matrices, dropping any not in `actual`.
  if (margin == 2) {
    observed <- t(observed)
    actual <- t(actual)
  }
  observed <- observed[rownames(actual), colnames(actual)]

  # Handle spurious non-zero observations
  n <- sum(observed[actual == 0] > 0)
  if (n > 0) {
    message(paste("Zeroing", n, "values in `observed`"))
    observed[actual == 0] <- 0
  }

  error <- observed / actual
  estimate <- center(error)

  if (boot)
    bootreps <- bootrep_center(error, R = times)
  else 
    bootreps <- NULL

  structure(class = "mc_bias_fit",
    list(
      observed = observed,
      actual = actual,
      estimate = estimate,
      bootreps = bootreps
    )
  )
}

#' @rdname estimate_bias
#' @method estimate_bias otu_table
#' @export
estimate_bias.otu_table <- function(observed, actual, ...) {
  stopifnot(class(actual) %in% c("otu_table", "phyloseq"))
  stopifnot(all(taxa_names(actual) %in% taxa_names(observed)))
  stopifnot(all(sample_names(actual) %in% sample_names(observed)))

  # Calling `otu_table(actual)` ensures this works whether `actual` is a
  # phyloseq or otu_table object.
  actual <- otu_table(actual)

  # Orient both tables with samples as rows before passing to matrix method,
  # which will adjust row and column order.
  if (taxa_are_rows(observed)) observed <- t(observed)
  if (taxa_are_rows(actual)) actual <- t(actual)

  estimate_bias.matrix(
    as(observed, "matrix"), 
    as(actual, "matrix"),
    margin = 1,
    ...
  )
}

#' @rdname estimate_bias
#' @method estimate_bias phyloseq
#' @export
estimate_bias.phyloseq <- function(observed, actual, ...) {
  stopifnot(class(actual) %in% c("otu_table", "phyloseq"))
  # Calling `otu_table()` on an `otu_table()` just returns itself; hence this
  # call works whether `actual` is a phyloseq or otu_table object.
  estimate_bias.otu_table(otu_table(observed), otu_table(actual), ...)
}

#' @export
fitted.mc_bias_fit <- function(object) {
  # TODO: give a norm "keep" option that keeps the totals in _observed_
  # Or, could be a "scale" param with a "counts" or "original" or "observed"
  # option
  perturb(object$actual, object$estimate, margin = 1, norm = "close")
}

#' @export
residuals.mc_bias_fit <- function(object) {
  # For now, give residuals on the proportion scale; in future, give options
  # for other types.
  observed <- object$observed %>% sweep(., 1, rowSums(.), `/`)
  fitted <- fitted(object)
  observed - fitted
}

#' @export
coef.mc_bias_fit <- function(object) {
  object$estimate
}

#' @export
print.mc_bias_fit <- function(x) {
  cat('A metacal bias fit.', fill = TRUE)
  cat(fill = TRUE)
  cat('Estimated relative efficiencies:', fill = TRUE)
  print(x$estimate)
  if (!is.null(x$bootreps)) {
    cat(fill = TRUE)
    cat('Contains', nrow(x$bootreps), 'bootstrap replicates.', fill = TRUE)
  }
  invisible(x)
}

#' @export
summary.mc_bias_fit <- function(object) {
  s <- structure(list(), class = "mc_bias_fit_summary")

  # If no taxon names, use the integer index
  taxon_names <- names(object$estimate)
  if (is.null(taxon_names))
    taxon_names <- seq_along(object$estimate)

  # Create a tibble with the estimated coefficients, with means and standard
  # errors from the bootreps (if they exist)
  s$coefficients <- tibble::tibble(
    taxon = taxon_names,
    estimate = object$estimate,
  )
  if (!is.null(object$bootreps)) {
    s$coefficients <- s$coefficients %>% 
      tibble::add_column(
        gm_mean = object$bootreps %>% apply(2, gm_mean),
        gm_se = object$bootreps %>% apply(2, gm_sd)
      )
    s$n_bootreps <- nrow(object$bootreps)
  }

  s
}

#' @export
print.mc_bias_fit_summary <- function(x) {
  cat('Summary of a metacal bias fit.', fill = TRUE)
  cat(fill = TRUE)
  cat('Estimated relative efficiencies:', fill = TRUE)
  print(x$coefficients)
  if (!is.null(x$n_bootreps)) {
    cat(fill = TRUE)
    cat('Geometric standard error estimated from', x$n_bootreps, 
      'bootstrap replicates.', 
      fill = TRUE)
  }
  invisible(x)
}
mikemc/metacal documentation built on Feb. 20, 2022, 1:46 a.m.