R/sim_bnb.r
In depower: Power Analysis for Differential Expression Studies

Documented in sim_bnb

#' Simulate data from a BNB distribution
#'
#' Simulate data from the bivariate negative binomial (BNB) distribution. The
#' BNB distribution is used to simulate count data where the event counts are
#' jointly dependent (correlated). For independent data, see
#' [depower::sim_nb()].
#'
#' The negative binomial distribution may be defined using a gamma-Poisson
#' mixture distribution. In this case, the Poisson parameter \eqn{\lambda}
#' is a random variable with gamma distribution. Equivalence between different
#' parameterizations are demonstrated below:
#'
#' ```{r, gamma_poisson_mixture, dev=c('svg','cairo_pdf'), fig.width=4, fig.height=4, fig.show='hide'}
#' # Define constants and their relationships
#' n <- 10000
#' dispersion <- 8
#' mu <- 4
#' p <- dispersion / (dispersion + mu)
#' q <- mu / (mu + dispersion)
#' variance <- mu + (mu^2 / dispersion)
#' rate <- p / (1 - p)
#' scale <- (1 - p) / p
#'
#' # alternative formula for mu
#' mu_alt <- (dispersion * (1 - p)) / p
#' stopifnot(isTRUE(all.equal(mu, mu_alt)))
#'
#' set.seed(20240321)
#'
#' # Using built-in rnbinom with dispersion and mean
#' w <- rnbinom(n = n, size = dispersion, mu = mu)
#'
#' # Using gamma-Poisson mixture with gamma rate parameter
#' x <- rpois(
#'   n = n,
#'   lambda = rgamma(n = n, shape = dispersion, rate = rate)
#' )
#'
#' # Using gamma-Poisson mixture with gamma scale parameter
#' y <- rpois(
#'   n = n,
#'   lambda = rgamma(n = n, shape = dispersion, scale = scale)
#' )
#'
#' # Using gamma-Poisson mixture with multiplicative mean and
#' # gamma scale parameter
#' z <- rpois(
#'   n = n,
#'   lambda = mu * rgamma(n = n, shape = dispersion, scale = 1/dispersion)
#' )
#'
#' # Compare CDFs
#' par(mar=c(4,4,1,1))
#' plot(
#'   x = sort(w),
#'   y = (1:n)/n,
#'   xlim = range(c(w,x,y,z)),
#'   ylim = c(0,1),
#'   col = 'green',
#'   lwd = 4,
#'   type = 'l',
#'   main = 'CDF'
#' )
#' lines(x = sort(x), y = (1:n)/n, col = 'red', lwd = 2)
#' lines(x = sort(y), y = (1:n)/n, col = 'yellow', lwd = 1.5)
#' lines(x = sort(z), y = (1:n)/n, col = 'black')
#' ```
#'
#' \if{html}{\out{<div style="display: flex; justify-content: center; padding-top: 10px; padding-bottom: 10px;">
#'   <img style="max-width: 100\%; height: auto;" src="figures/gamma_poisson_mixture-1.svg" alt="Gamma-Poisson Mixture CDF" />
#' </div>
#' <br>}}
#' \if{latex}{
#'   \out{\begin{center}}
#'   \figure{gamma_poisson_mixture-1.pdf}{options: width=3in}
#'   \out{\end{center}}
#' }
#'
#' The BNB distribution is implemented by compounding two conditionally
#' independent Poisson random variables \eqn{X_1 \mid G = g \sim \text{Poisson}(\mu g)}
#' and \eqn{X_2 \mid G = g \sim \text{Poisson}(r \mu g)} with a gamma random
#' variable \eqn{G \sim \text{Gamma}(\theta, \theta^{-1})}. The probability mass
#' function for the joint distribution of \eqn{X_1, X_2} is
#'
#' \deqn{
#' P(X_1 = x_1, X_2 = x_2) = \frac{\Gamma(x_1 + x_2 + \theta)}{(\mu + r \mu + \theta)^{x_1 + x_2 + \theta}}
#'   \frac{\mu^{x_1}}{x_1!} \frac{(r \mu)^{x_2}}{x_2!}
#'   \frac{\theta^{\theta}}{\Gamma(\theta)}
#' }
#'
#' where \eqn{x_1,x_2 \in \mathbb{N}^{\geq 0}} are specific values of the count
#' outcomes, \eqn{\theta \in \mathbb{R}^{> 0}} is the `dispersion` parameter
#' which controls the dispersion and level of correlation between the two
#' samples (otherwise known as the shape parameter of the gamma distribution),
#' \eqn{\mu \in \mathbb{R}^{> 0}} is the mean parameter, and
#' \eqn{r = \frac{\mu_2}{\mu_1} \in \mathbb{R}^{> 0}} is the `ratio` parameter
#' representing the multiplicative change in the mean of the second sample
#' relative to the first sample. \eqn{G} denotes the random subject effect and
#' the gamma distribution scale parameter is assumed to be the inverse of the
#' dispersion parameter (\eqn{\theta^{-1}}) for identifiability.
#'
#' Correlation decreases from 1 to 0 as the dispersion parameter increases from
#' 0 to infinity. For a given dispersion, increasing means also increases the
#' correlation. See 'Examples' for a demonstration.
#'
#' See 'Details' in [depower::sim_nb()] for additional information on the
#' negative binomial distribution.
#'
#' @references
#' \insertRef{yu_2020}{depower}
#'
#' \insertRef{rettiganti_2012}{depower}
#'
#' \insertRef{aban_2009}{depower}
#'
#' @param n (integer: `[2, Inf)`)\cr
#'        The number(s) of paired observations.
#' @param mean1 (numeric: `(0, Inf)`)\cr
#'        The mean(s) of sample 1 \eqn{(\mu_1)}.
#' @param mean2,ratio (numeric: `(0, Inf)`)\cr
#'        Only specify one of these arguments.
#' - `mean2`: The mean(s) of sample 2 \eqn{(\mu_2)}.
#' - `ratio`: The ratio(s) of means for sample 2 with respect to sample 1
#'        \eqn{\left( r = \frac{\mu_2}{\mu_1} \right)}.
#'
#' `mean2 = ratio * mean1`
#' @param dispersion (numeric: `(0, Inf)`)\cr
#'        The gamma distribution shape parameter(s) \eqn{(\theta)} which control
#'        the dispersion and the correlation between sample 1 and 2.
#'        See 'Details' and 'Examples'.
#' @param nsims (Scalar integer: `1L`; `[1, Inf)`)\cr
#'        The expected number of simulated data sets. If `nsims > 1`, the data
#'        is returned in a list-column of a depower simulation data frame.
#'        `nsims` may be reduced depending on `max_zeros`.
#' @param return_type (string: `"list"`; `c("list", "data.frame")`)\cr
#'        The data structure of the simulated data. If `"list"` (default), a
#'        list object is returned. If `"data.frame"` a data frame in tall format
#'        is returned. The list object provides computational efficiency and the
#'        data frame object is convenient for formulas. See 'Value'.
#' @param max_zeros (Scalar numeric: `0.99`; `[0, 1]`)\cr
#'        The maximum proportion of zeros each group in a simulated dataset is
#'        allowed to have. If the proportion of zeros is greater than this
#'        value, the corresponding data is excluded from the set of simulations.
#'        This is most likely to occur when the sample size is small and the
#'        dispersion parameter is small.
#' @param ncores (Scalar integer: `1L`; `[1,Inf)`)\cr
#'        The number of cores (number of worker processes) to use. Do not set
#'        greater than the value returned by [parallel::detectCores()]. May be
#'        helpful when the number of parameter combinations is large and `nsims`
#'        is large.
#'
#' @return If `nsims = 1` and the number of unique parameter combinations is
#' one, the following objects are returned:
#' - If `return_type = "list"`, a list:
#' \tabular{lll}{
#'   Slot \tab Name \tab Description \cr
#'   1 \tab \tab Simulated counts from sample 1. \cr
#'   2 \tab \tab Simulated counts from sample 2.
#' }
#' - If `return_type = "data.frame"`, a data frame:
#' \tabular{lll}{
#'   Column \tab Name \tab Description \cr
#'   1 \tab `item` \tab Subject/item indicator. \cr
#'   2 \tab `condition` \tab Sample/condition indicator. \cr
#'   3 \tab `value` \tab Simulated counts.
#' }
#'
#' If `nsims > 1` or the number of unique parameter combinations is greater than
#' one, each object described above is returned in a list-column named `data` in
#' a depower simulation data frame:
#' \tabular{lll}{
#'   Column \tab Name \tab Description \cr
#'   1 \tab `n1` \tab Sample size of sample 1. \cr
#'   2 \tab `n2` \tab Sample size of sample 2. \cr
#'   3 \tab `mean1` \tab Mean for sample 1. \cr
#'   4 \tab `mean2` \tab Mean for sample 2. \cr
#'   5 \tab `ratio` \tab Ratio of means (sample 2 / sample 1). \cr
#'   6 \tab `dispersion` \tab Gamma distribution shape parameter (dispersion). \cr
#'   7 \tab `nsims` \tab Number of valid simulation iterations. \cr
#'   8 \tab `distribution` \tab Distribution sampled from. \cr
#'   9 \tab `data` \tab List-column of simulated data.
#' }
#'
#' @seealso [depower::sim_nb()], [stats::rpois()], [stats::rgamma()],
#' [stats::rnbinom()]
#'
#' @examples
#' #----------------------------------------------------------------------------
#' # sim_bnb() examples
#' #----------------------------------------------------------------------------
#' library(depower)
#'
#' # Paired two-sample data returned in a data frame
#' sim_bnb(
#'   n = 10,
#'   mean1 = 10,
#'   ratio = 1.6,
#'   dispersion = 3,
#'   nsims = 1,
#'   return_type = "data.frame"
#' )
#'
#' # Paired two-sample data returned in a list
#' sim_bnb(
#'   n = 10,
#'   mean1 = 10,
#'   ratio = 1.6,
#'   dispersion = 3,
#'   nsims = 1,
#'   return_type = "list"
#' )
#'
#' # Two simulations of paired two-sample data
#' # returned as a list of data frames
#' sim_bnb(
#'   n = 10,
#'   mean1 = 10,
#'   ratio = 1.6,
#'   dispersion = 3,
#'   nsims = 2,
#'   return_type = "data.frame"
#' )
#'
#' # Two simulations of Paired two-sample data
#' # returned as a list of lists
#' sim_bnb(
#'   n = 10,
#'   mean1 = 10,
#'   ratio = 1.6,
#'   dispersion = 3,
#'   nsims = 2,
#'   return_type = "list"
#' )
#'
#' #----------------------------------------------------------------------------
#' # Visualization of the BNB distribution as dispersion varies.
#' #----------------------------------------------------------------------------
#' set.seed(1234)
#' data <- lapply(
#'   X = c(1, 10, 100, 1000),
#'   FUN = function(x) {
#'     d <- sim_bnb(
#'       n = 1000,
#'       mean1 = 10,
#'       ratio = 1.5,
#'       dispersion = x,
#'       nsims = 1,
#'       return_type = "data.frame"
#'     )
#'     cor <- cor(
#'       x = d[d$condition == "1", ]$value,
#'       y = d[d$condition == "2", ]$value
#'     )
#'     cbind(dispersion = x, correlation = cor, d)
#'   }
#' )
#'
#' data <- do.call(what = "rbind", args = data)
#'
#' ggplot2::ggplot(
#'   data = data,
#'   mapping = ggplot2::aes(x = value, fill = condition)
#' ) +
#'   ggplot2::facet_wrap(
#'     facets = ggplot2::vars(.data$dispersion),
#'     ncol = 2,
#'     labeller = ggplot2::labeller(.rows = ggplot2::label_both)
#'   ) +
#'   ggplot2::geom_density(alpha = 0.3) +
#'   ggplot2::coord_cartesian(xlim = c(0, 60)) +
#'   ggplot2::geom_text(
#'     mapping = ggplot2::aes(
#'       x = 30,
#'       y = 0.12,
#'       label = paste0("Correlation: ", round(correlation, 2))
#'     ),
#'     check_overlap = TRUE
#'   ) +
#'   ggplot2::labs(
#'     x = "Value",
#'     y = "Density",
#'     fill = "Condition",
#'     caption = "Mean1=10, Mean2=15, ratio=1.5"
#'   )
#'
#' @importFrom stats rgamma rpois
#'
#' @export
sim_bnb <- function(
    n,
    mean1,
    mean2,
    ratio,
    dispersion,
    nsims = 1L,
    return_type = "list",
    max_zeros = 0.99,
    ncores = 1L
) {
  #-----------------------------------------------------------------------------
  # Check arguments
  #-----------------------------------------------------------------------------
  if(!is.numeric(n) || any(n < 2L)) {
    stop("Argument 'n' must be an integer vector from [2, Inf).")
  }
  if(!is.numeric(mean1) || any(mean1 <= 0)) {
    stop("Argument 'mean1' must be a positive numeric vector.")
  }

  missing_mean2 <- missing(mean2)
  missing_ratio <- missing(ratio)
  if(missing_mean2 && missing_ratio) {
    stop("You must specify one of the arguments: 'mean2' or 'ratio'.")
  }
  if(!missing_mean2 && !missing_ratio) {
    stop("Arguments 'mean2' and 'ratio' were both specified. You may only specify one.")
  }
  if(missing_ratio) {
    if(!is.numeric(mean2) || any(mean2 <= 0)) {
      stop("Argument 'mean2' must be a positive numeric vector.")
    }
    ratio_keep <- NULL # For filtering step below
    ratio <- NULL # For checking length(ratio)
  }
  if(missing_mean2) {
    if(!is.numeric(ratio) || any(ratio <= 0)) {
      stop("Argument 'ratio' must be a positive numeric vector.")
    }
    mean2 <- as.numeric(tcrossprod(ratio, mean1))
    # For filtering step below
    # Numerical accuracy should be fine to 5 decimals...
    ratio_keep <- round(ratio, 5)
  }

  if(!is.numeric(dispersion) || any(dispersion <= 0)) {
    stop("Argument 'dispersion' must be a positive numeric vector.")
  }
  if(!is.numeric(nsims) || length(nsims) != 1L || nsims < 1L) {
    stop("Argument 'nsims' must be a positive scalar integer.")
  }
  if(length(return_type) != 1L) {
    stop("Argument 'return_type' must be one of 'list' or 'data.frame'.")
  }
  data.frame <- switch(return_type,
    "list" = FALSE,
    "data.frame" = TRUE,
    stop("Argument 'return_type' must be one of 'list' or 'data.frame'.")
  )
  if(!is.numeric(max_zeros) || length(max_zeros) != 1L || max_zeros < 0 || max_zeros > 1) {
    stop("Argument 'max_zeros' must be a scalar numeric from [0,1].")
  }
  if(!is.numeric(ncores) || length(ncores) != 1L || ncores < 1L) {
    stop("Argument 'ncores' must be a positive scalar integer.")
  }
  if(ncores > 1L) {
    if(isTRUE(ncores > parallel::detectCores())) {
      max <- parallel::detectCores()
      warning("Argument 'ncores' should not be greater than ", max, ".")
    }
    cluster <- multidplyr::new_cluster(ncores)
    multidplyr::cluster_library(cluster, 'depower')
  }

  needs_grid <- any(c(nsims, lengths(list(n, mean1, mean2, ratio, dispersion))) > 1L)

  #-----------------------------------------------------------------------------
  # Simulate data
  #-----------------------------------------------------------------------------
  res <- if(needs_grid) {
    grid_bnb(
      n = n,
      mean1 = mean1,
      mean2 = mean2,
      ratio = ratio,
      dispersion = dispersion,
      nsims = nsims,
      data.frame = data.frame,
      max_zeros = max_zeros,
      ncores = ncores,
      ratio_keep = ratio_keep
    )
  } else {
    sim_bnb_two_sample(
      n = n,
      mean1 = mean1,
      mean2 = mean2,
      dispersion = dispersion,
      nsims = nsims,
      data.frame = data.frame
    )
  }

  #-----------------------------------------------------------------------------
  # Return
  #-----------------------------------------------------------------------------
  res
}

grid_bnb <- function(
    n,
    mean1,
    mean2,
    ratio,
    dispersion,
    nsims,
    data.frame,
    max_zeros,
    ncores,
    ratio_keep
) {
  #-----------------------------------------------------------------------------
  # Unique combinations for simulating data.
  # Compute ratio in second step to filter unwanted rows for the case when
  # mean1 and ratio are provided in function.
  #-----------------------------------------------------------------------------
  grid_sim <- expand.grid(
    n1 = n,
    mean1 = mean1,
    mean2 = mean2,
    dispersion1 = dispersion,
    nsims = nsims,
    distribution = "Dependent two-sample BNB",
    stringsAsFactors = FALSE
  ) |>
    dplyr::mutate(ratio = round(mean2 / mean1, 5), .after = "mean2") |>
    {\(.) if(!is.null(ratio_keep)) {dplyr::filter(.data = ., ratio %in% ratio_keep)} else {.}}()

  #-----------------------------------------------------------------------------
  # Simulate data
  #-----------------------------------------------------------------------------
  if(ncores > 1L) {
    cluster <- multidplyr::new_cluster(ncores)
    multidplyr::cluster_library(cluster, 'depower')
  }

  # Simulate data
  res <- grid_sim |>
    dplyr::rowwise() |>
    {\(.) if(ncores > 1L) {multidplyr::partition(data = ., cluster = cluster)} else {.}}() |>
    dplyr::mutate(
      data = list(
        sim_bnb_two_sample(
          n = n1,
          mean1 = mean1,
          mean2 = mean2,
          dispersion = dispersion1,
          nsims = nsims,
          data.frame = data.frame
        )
      )
    ) |>
    {\(.) if(ncores > 1L) {dplyr::collect(x = .)} else {.}}() |>
    dplyr::ungroup() |>
    dplyr::mutate(n2 = n1, .after = "n1")

  # Check if simulated data is all zeros or if the vast majority of data is all
  # zeros. This primarily occurs if you've selected a small dispersion (<0.1)
  # and have a small sample size.
  if(any_zeros(res[["data"]], max_zeros)) {
    res <- res |>
      dplyr::rowwise() |>
      dplyr::mutate(
        data = list(data[not_zeros(data, max_zeros)]),
        nsims = length(data)
      ) |>
      dplyr::ungroup()
  }

  #-----------------------------------------------------------------------------
  # Prepare return
  #-----------------------------------------------------------------------------
  # Column labels
  vars <- c(
    "n1" = "n1",
    "n2" = "n2",
    "Mean1" = "mean1",
    "Mean2" = "mean2",
    "Ratio" = "ratio",
    "Dispersion" = "dispersion1",
    "N Simulations" = "nsims",
    "Distribution" = "distribution",
    "Data" = "data"
  )
  idx <- match(names(res), vars)
  if(anyNA(idx)) {stop("Unknown variable found while labeling data frame.")}
  for(i in seq_len(ncol(res))) {
    attr(res[[i]], "label") <- names(vars)[idx][i]
  }

  # Class
  class(res) <- c("depower", "bnb", class(res))

  #-----------------------------------------------------------------------------
  # Return
  #-----------------------------------------------------------------------------
  res
}

sim_bnb_two_sample <- function(
    n,
    mean1,
    mean2,
    dispersion,
    nsims,
    data.frame
) {
  #-----------------------------------------------------------------------------
  # Simulate
  #-----------------------------------------------------------------------------
  if(nsims > 1L) {
    res <- lapply(seq_len(nsims), function(x) {
      g <- rgamma(n = n, shape = dispersion, scale = 1 / dispersion)
      mean1 <- mean1 * g
      mean2 <- mean2 * g

      list(
        value1 = rpois(n = n, lambda = mean1),
        value2 = rpois(n = n, lambda = mean2)
      )
    })
    if(data.frame) {
      res <- lapply(res, list_to_df)
    }
  } else { # nsims == 1
    g <- rgamma(n = n, shape = dispersion, scale = 1 / dispersion)
    mean1 <- mean1 * g
    mean2 <- mean2 * g

    res <- list(
      value1 = rpois(n = n, lambda = mean1),
      value2 = rpois(n = n, lambda = mean2)
    )
    if(data.frame) {
      res <- list_to_df(res)
    }
  }

  #-----------------------------------------------------------------------------
  # Return
  #-----------------------------------------------------------------------------
  res
}

utils::globalVariables(c("n1", "dispersion1", "data"))