metacal: Bias estimation and calibration for metagenomics experiments

Documented in center log_center_proj log_center_rss proj_mat

# Center (Compositional mean) -------------------------------------------------

# The function `center()` is the only exported function. It allows for multiple
# methods of computing the center, specified with the `method` argument, and
# for specifying the input and output scales and output format, and does some
# preformating to feed to the functions that compute by the specific methods.

# The functions implementing the various methods are `log_center_*()`; these
# take a matrix on the log scale, and provide as output a named vector on the
# log scale.

# TODO: test these functions behave as expected when there is an instance of
# O(s)_i/A(s)_i = \inf or 0; and add better options in this case (e.g., should
# not center the elements)

#' Compute the center (compositional mean) of a set of compositions
#'
#' Unobserved values should be marked as NaN. 
#'
#' @param .data All-numeric data frame or matrix with taxa as columns.
#' @param weights Sample (row) weights.
#' @param method Method for computing the center: "proj", "gm", or "rss".
#' @param in_scale "linear" or "log".
#' @param out_scale "linear" or "log".
#' @param denom Taxa to use in the denominator; if NULL, use all taxa.
#' @param enframe Whether to return the bias estimate as a two-column tibble.
#'
#' @export
center <- function(.data, weights = rep(1, nrow(.data)), method = "proj",
    in_scale = "linear", out_scale = "linear", denom = NULL, enframe = FALSE, 
    components = FALSE) {
    if (!(in_scale %in% c("linear", "log"))) {
        stop('`in_scale` must be "linear" or "log"')
    }
    if (!(out_scale %in% c("linear", "log")))
        stop('`out_scale` must be "linear" or "log"')
    if (!(method %in% c("proj", "gm", "rss")))
        stop('`method` must be "proj", "gm", or "rss"')
    if ( in_scale == "linear" & !all(.data > 0 | is.nan(.data)) )
        stop('Elements of `.data` must be > 0 or NaN if `in_scale == "linear"`')

    # The log_center_*()'s require a matrix of log compositions
    mat <- .data %>% as("matrix")
    if (in_scale == "linear")
        mat <- log(mat)

    # Currently, this function will not give meaningful results if `mat`
    # contains any +/- Inf or NA values.
    if ( ! all(is.finite(mat) | is.nan(mat)) )
        stop('Elements of log compositions must be finite or NaN')

    # Check that there is only one component in the co-occurence graph (and
    # that it includes all taxa)
    cmps <- cooccurrence_components(mat)
    if ( !components && (length(unique(cmps)) > 1) )
        # stop('Co-occurrance graph has multiple components')
        stop('The center is not fully determined')

    if (method == "proj") {
        b <- log_center_proj(mat, weights)
    } else if (method == "gm") {
        b <- log_center_gm(mat, weights)
    } else if (method == "rss") {
        b <- log_center_rss(mat, weights)
    }

    if (!is.null(denom))
        b <- b - mean(b[denom])

    if (out_scale == "linear")
        b <- exp(b)

    if (components) {
        b <- dplyr::left_join(
            tibble::enframe(b, "Taxon", "Center"),
            tibble::enframe(cmps, "Taxon", "Component"), 
            by = "Taxon"
        )
    } else if (enframe) {
        b <- tibble::enframe(b, "Taxon", "Center")
    }

    b
}

#' Projection matrix P_M
#' 
#' Used internally for the "proj" method of computing the compositional mean.
#' See vandenBoogaart2006 and Bren2008.
#'
#' @param K number of elements (taxa)
#' @param M set of missing elements (default is none missing)
proj_mat <- function(K, M = c()) {
    mat <- diag(nrow = K) - 1/(K - length(M))
    mat[M,] <- 0
    mat[,M] <- 0
    mat
}

# Elements are centered w.r.t. the non-NaN elements.
# TODO: add a test for this ^
log_center_gm <- function(mat, weights = rep(1, nrow(mat))) {
    clr_center <- mat %>%
        apply(2, weighted.mean, weights) %>%
        {. - mean(., na.rm = TRUE)}
    clr_center
}

#' Compute the log center by the projection method
#'
#' Compute the log center by the projection method, which allows for missing
#' observations.
#'
#' @references van den Boogaart KG, Tolosana-Delgado R, Bren M. 2006. Concepts
#' for handling of zeros and missing values in compositional data. Proc IAMG
#' 6:1-4. http://www.stat.boogaart.de/Publications/iamg06_s07_01.pdf
#'
#' @references Bren M, Tolosana-Delgado R, van den Boogaart KG. 2008. News from
#' "compositions", the R package.
#' https://core.ac.uk/download/pdf/132548286.pdf
log_center_proj <- function(mat, weights = rep(1, nrow(mat))) {

    # Proper normalization appears to handled by the ginv matrix, so is
    # unnecessary to normalize the weights.

    K <- ncol(mat)

    mat0 <- mat
    mat0[is.nan(mat0)] <- 0

    P_sum <- diag(0, nrow = K)
    v_sum <- rep(0, K)
    for (i in seq(nrow(mat))) {
        M <- which(is.nan(mat[i,]))
        v <- mat0[i,]
        P <- proj_mat(K, M) * weights[i]
        P_sum <- P_sum + P
        v_sum <- v_sum + P %*% v
    }

    clr_center <- (MASS::ginv(P_sum) %*% v_sum) %>% c
    names(clr_center) <- colnames(mat)
    clr_center
}

#' Compute the log center by numerical optimization
#'
#' @param bound Single number giving the lower and upper bound on the alr
#'   efficiencies ("rss" only).
log_center_rss <- function(mat, weights = rep(1, nrow(mat)), bound = 10) {

    # This function computes the squared Aitchison norm of x on the
    # subcomposition of taxa that are observed.
    # 
    # x is a vector of the log compositional error of a sample
    row_rss <- function (x) {
        x %>%
            {. - mean(., na.rm = TRUE)} %>%
            {.^2} %>%
            sum(., na.rm = TRUE)
    }

    # mat is a matrix of the compositional errors on the log scale
    rss <- function(alr_bias, mat, weights) {
        # Substract the alr_bias from each row (sample)
        swept <- sweep(mat, 2, c(alr_bias, 0), "-")
        # Compute and sum the RSS's, weighting samples by `weights`
        apply(swept, 1, row_rss) %>%
            {. * weights} %>%
            sum
    }

    K <- ncol(mat)

    # Initial alr bias values to be passed to `par` arg of `optim()`
    initial <- rep(0, K-1)

    # Compute minimizer
    alr_center <- optim(initial, fn = rss, mat = mat,
        method = "L-BFGS-B", weights = weights, lower = -bound, upper = bound)

    # Format and return results
    clr_center <- c(alr_center$par, 0) %>%
        {. - mean(.)}
    names(clr_center) <- colnames(mat)
    clr_center
}

# Boostrapping ----------------------------------------------------------------

# TODO: 
# - add tests
# - improve documentation
# - Test handling of log scale and non-default denominator
# - Warn if not all samples have all taxa and dist = "multinomial"

# Choice of two weight distributions - Multinomial(N, rep(1/S, S)), as in the
# classic bootstrap, and Dirichlet(rep(1, S)), as in the Bayesian bootstrap of
# Rubin.

# N - the choice of N for multinomial sampling.

#' Generate bootstrap replicates of the sample center
#'
#' @param .data All-numeric data frame or matrix with taxa as columns.
#' @param R Number of bootstrap replicates.
#' @param N Number of trials for multinomial resampling.
#' @param method Method for computing the center: "proj", "gm", or "rss".
#' @param dist Distribution for drawing the bootstrap weights: "dirichlet" or
#'   "multinomial".
#' @param in_scale "linear" (default) or "log".
#' @param out_scale "linear" (default) or "log".
#' @param denom Taxa to use in the denominator; if NULL, use all taxa.
#' @param enframe Whether to "enframe" the bootstrap estimates in a tibble.
#'
#' @export
bootrep_center <- function(.data, R = 4000, N = nrow(.data), method = "proj",
    dist = "dirichlet", in_scale = "linear", out_scale = "linear",
    denom = NULL, enframe = FALSE) {

    if (!(dist %in% c("multinomial", "dirichlet")))
        stop('`dist` must be "multinomial" or "dirichlet"')

    if (N != nrow(.data) & dist == "dirichlet")
        stop('`N != nrow(.data)` only supported with dist = "multinomial"')

    # Test on the full dataset. Ensures that the bias estimate is fully-defined
    # if all samples get positive weight.
    bhat <- center(.data, method = method, in_scale = in_scale, 
        out_scale = out_scale, denom = denom)

    mat <- .data %>% as("matrix")
    if (in_scale == "linear")
        mat <- log(mat)

    # List of weights for each replicate
    if (dist == "multinomial") {
        # Weights ~ Multinomial(N, rep(1, N0) / N0)
        N0 <- nrow(.data)
        wmat <- rmultinom(R, N, rep(1, N0))
        wlist <- lapply(seq(R), function(i) wmat[,i])
    } else if (dist == "dirichlet") {
        # Weights ~ Dirichlet(rep(1, N))
        # Can get a single draw from a Dirichlet(rep(1, N)) by normalizing a
        # vector of K iid exponential random variables to sum to 1; however,
        # normalization is optional when weights are passed to the weighted
        # center functions.
        wlist <- rep(N, R) %>%
            purrr::map(rexp, rate = 1)
    }
    names(wlist) <- seq_along(wlist)

    # Define the function to compute Bhat
    if (method == "proj") {
        log_center <- function(w, mat) log_center_proj(mat, w)
    } else if (method == "gm") {
        log_center <- function(w, mat) log_center_gm(mat, w)
    } else if (method == "rss") {
        log_center <- function(w, mat) log_center_rss(mat, w)
    }
    # Compute Bhat for each bootstrap replicate
    reps <- purrr::map(wlist, log_center, mat = mat)

    # TODO: Set the denominator in the matix with sweep()
    # Set the denominator
    if (!is.null(denom))
        reps <- purrr::map(reps, ~ . - mean(.[denom]))

    # Combine into a matrix with taxa as columns
    reps <- do.call(rbind, reps)

    if (out_scale == "linear")
        reps <- exp(reps)

    if (enframe) {
      # Join into a single data frame
      reps <- reps %>%
        tibble::as_tibble() %>%
        tibble::rowid_to_column(var = ".id") %>%
        tidyr::pivot_longer(-.id, names_to = "Taxon", values_to = "Center")
    }

    reps
}