R/utility.R

Defines functions copy_psi_theta_phi resample_z_psi_theta_phi get_theta_phi_i get_z_i sample_u sample_z find_maxJ find_maxK predict_detect_probs_regional predict_detect_probs_local predict_pi .cutil_regional .cutil_local cutil_regional cutil_local eutil check_args_eval_util_R check_args_eval_util_L list_cond_R list_cond_L eval_util_R eval_util_L

Documented in eval_util_L eval_util_R list_cond_L list_cond_R

#' @title Expected utility for local species diversity assessments.
#'
#' @description `eval_util_L()` evaluates the expected utility of a local
#'  species diversity assessment by using Monte Carlo integration.
#' @details
#' The utility of local species diversity assessment for a given set of sites
#'  can be defined as the expected number of detected species per site
#'  (Fukaya et al. 2022). `eval_util_L()` evaluates this utility for arbitrary
#'  sets of sites that can potentially have different values for site occupancy
#'  status of species, \eqn{z}{z}, sequence capture probabilities of species,
#'  \eqn{\theta}{theta}, and sequence relative dominance of species,
#'  \eqn{\phi}{phi}, for the combination of `K` and `N` values specified in the
#'  `conditions` argument.
#'  Such evaluations can be used to balance `K` and `N` to maximize the utility
#'  under a constant budget (possible combinations of `K` and `N` under a
#'  specified budget and cost values are easily obtained using `list_cond_L()`;
#'  see the example below).
#'  It is also possible to examine how the utility varies with different `K`
#'  and `N` values without setting a budget level, which may be useful for determining
#'  a satisfactory level of `K` and `N` from a purely technical point of view.
#' The expected utility is defined as the expected value of the conditional
#'  utility in the form:
#'  \deqn{U(K, N \mid \boldsymbol{r}, \boldsymbol{u}) = \frac{1}{J}\sum_{j = 1}^{J}\sum_{i = 1}^{I}\left\{1 - \prod_{k = 1}^{K}\left(1 - \frac{u_{ijk}r_{ijk}}{\sum_{m = 1}^{I}u_{mjk}r_{mjk}} \right)^N \right\}}{U(K, N | r, u) = (1 / J) * sum_{j, i}((1 - \prod_{k}(1 - (u[i, j, k] * r[i, j, k])/sum(u[, j, k] * r[, j, k])))^N)}
#'  where \eqn{u_{ijk}}{u[i, j, k]} is a latent indicator variable representing
#'  the inclusion of the sequence of species \eqn{i}{i} in replicate \eqn{k}{k}
#'  at site \eqn{j}{j}, and \eqn{r_{ijk}}{r[i, j, k]} is a latent variable that
#'  is proportional to the relative frequency of the sequence of species
#'  \eqn{i}{i}, conditional on its presence in replicate \eqn{k}{k} at site
#'  \eqn{j}{j} (Fukaya et al. 2022).
#' Expectations are taken with respect to the posterior (or possibly prior)
#'  predictive distributions of \eqn{\boldsymbol{r} = \{r_{ijk}\}}{r} and
#'  \eqn{\boldsymbol{u} = \{u_{ijk}\}}{u}, which are evaluated numerically using
#'  Monte Carlo integration. The predictive distributions of
#'  \eqn{\boldsymbol{r}}{r} and \eqn{\boldsymbol{u}}{u} depend on the model
#'  parameters \eqn{z}{z}, \eqn{\theta}{theta}, and \eqn{\phi}{phi} values.
#'  Their posterior (or prior) distribution is specified by supplying an
#'  `occumbFit` object containing their posterior samples via the `fit` argument,
#'  or by supplying a matrix or array of posterior (or prior) samples of
#'  parameter values via the `z`, `theta`, and `phi` arguments. Higher
#'  approximation accuracy can be obtained by increasing the value of `N_rep`.
#'
#' The `eval_util_L()` function can be executed by supplying the `fit` argument
#'  without specifying the `z`, `theta`, and `phi` arguments, by supplying the
#'  three `z`, `theta`, and `phi` arguments without the `fit` argument, or by
#'  supplying the `fit` argument and any or all of the `z`, `theta`, and `phi`
#'  arguments. If `z`, `theta`, or `phi` arguments are specified in addition
#'  to the `fit`, the parameter values given in these arguments are used
#'  preferentially to evaluate the expected utility. If the sample sizes differ among
#'  parameters, parameters with smaller sample sizes are resampled with
#'  replacements to align the sample sizes across parameters.
#'
#' The expected utility is evaluated assuming homogeneity of replicates, in the
#'  sense that \eqn{\theta}{theta} and \eqn{\phi}{phi}, the model parameters
#'  associated with the species detection process, are constant across
#'  replicates within a site. For this reason, `eval_util_L()` does not accept
#'  replicate-specific \eqn{\theta}{theta} and \eqn{\phi}{phi}. If the
#'  `occumbFit` object supplied in the `fit` argument has a replicate-specific
#'  parameter, the parameter samples to be used in the utility evaluation must be
#'  provided explicitly via the `theta` or `phi` arguments.
#'
#' The Monte Carlo integration is executed in parallel on multiple CPU cores, where
#'  the `cores` argument controls the degree of parallelization.
#' @param settings A data frame that specifies a set of conditions under which
#'  utility is evaluated. It must include columns named `K` and `N`, which
#'  specify the number of replicates per site and the sequencing depth per
#'  replicate, respectively.
#'  `K` and `N` must be numeric vectors greater than 0. When `K` contains a
#'  decimal value, it is discarded and treated as an integer.
#'  Additional columns are ignored, but may be included.
#' @param fit An `occumbFit` object.
#' @param z Sample values of site occupancy status of species stored in an array
#'  with sample \eqn{\times}{*} species \eqn{\times}{*} site dimensions.
#' @param theta Sample values of sequence capture probabilities of species
#'  stored in a matrix with sample \eqn{\times}{*} species dimensions or an array
#'  with sample \eqn{\times}{*} species \eqn{\times}{*} site dimensions.
#' @param phi Sample values of sequence relative dominance of species stored in
#'  a matrix with sample \eqn{\times}{*} species dimensions or an array with
#'  sample \eqn{\times}{*} species \eqn{\times}{*} site dimensions.
#' @param N_rep Controls the sample size for the Monte Carlo integration.
#'  The integral is evaluated using `N_sample * N_rep` random samples,
#'  where `N_sample` is the maximum size of the MCMC sample in the `fit`
#'  argument and the parameter sample in the `z`, `theta`, and `phi` arguments.
#' @param cores The number of cores to use for parallelization.
#' @return A data frame with a column named `Utility` in which the estimates of the
#'  expected utility are stored. This is obtained by adding the `Utility` column
#'  to the data frame provided in the `settings` argument.
#' @section References:
#'      K. Fukaya, N. I. Kondo, S. S. Matsuzaki and T. Kadoya (2022)
#'      Multispecies site occupancy modelling and study design for spatially
#'      replicated environmental DNA metabarcoding. \emph{Methods in Ecology
#'      and Evolution} \strong{13}:183--193.
#'      \doi{10.1111/2041-210X.13732}
#' @examples
#' \donttest{
#' set.seed(1)
#'
#' # Generate a random dataset (20 species * 2 sites * 2 reps)
#' I <- 20 # Number of species
#' J <- 2  # Number of sites
#' K <- 2  # Number of replicates
#' data <- occumbData(
#'     y = array(sample.int(I * J * K), dim = c(I, J, K)))
#'
#' # Fitting a null model
#' fit <- occumb(data = data)
#'
#' ## Estimate expected utility
#' # Arbitrary K and N values
#' (util1 <- eval_util_L(expand.grid(K = 1:3, N = c(1E3, 1E4, 1E5)),
#'                       fit))
#'
#' # K and N values under specified budget and cost
#' (util2 <- eval_util_L(list_cond_L(budget = 1E5,
#'                                   lambda1 = 0.01,
#'                                   lambda2 = 5000,
#'                                   fit),
#'                       fit))
#'
#' # K values restricted
#' (util3 <- eval_util_L(list_cond_L(budget = 1E5,
#'                                   lambda1 = 0.01,
#'                                   lambda2 = 5000,
#'                                   fit,
#'                                   K = 1:5),
#'                       fit))
#'
#' # theta and phi values supplied
#' (util4 <- eval_util_L(list_cond_L(budget = 1E5,
#'                                   lambda1 = 0.01,
#'                                   lambda2 = 5000,
#'                                   fit,
#'                                   K = 1:5),
#'                       fit,
#'                       theta = array(0.5, dim = c(4000, I, J)),
#'                       phi = array(1, dim = c(4000, I, J))))
#' 
#' # z, theta, and phi values, but no fit object supplied
#' (util5 <- eval_util_L(list_cond_L(budget = 1E5,
#'                                   lambda1 = 0.01,
#'                                   lambda2 = 5000,
#'                                   fit,
#'                                   K = 1:5),
#'                       fit = NULL,
#'                       z = array(1, dim = c(4000, I, J)),
#'                       theta = array(0.5, dim = c(4000, I, J)),
#'                       phi = array(1, dim = c(4000, I, J))))
#' }
#' @export
eval_util_L <- function(settings,
                        fit = NULL,
                        z = NULL,
                        theta = NULL,
                        phi = NULL,
                        N_rep = 1,
                        cores = 1L) {

  # Validate arguments
  check_args_eval_util_L(settings, fit, z, theta, phi)

  # Set parameter values
  if (is.null(z)) {
    z <- get_post_samples(fit, "z")
  }

  if (is.null(theta)) {
    theta <- get_post_samples(fit, "theta")
  }

  if (is.null(phi)) {
    phi <- get_post_samples(fit, "phi")
  }

  # Determine site dimension
  has_site_dim <- c(TRUE,
                    length(dim(theta)) == 3,
                    length(dim(phi))   == 3)

  # Resampling to match sample size
  n_samples <- c(dim(z)[1], dim(theta)[1], dim(phi)[1])

  if (n_samples[1] < max(n_samples)) {
    z <- resample_z_psi_theta_phi(z, has_site_dim[1], max(n_samples))
  }
  if (n_samples[2] < max(n_samples)) {
    theta <- resample_z_psi_theta_phi(theta, has_site_dim[2], max(n_samples))
  }
  if (n_samples[3] < max(n_samples)) {
    phi <- resample_z_psi_theta_phi(phi, has_site_dim[3], max(n_samples))
  }

  # Adapt theta/phi when they are species-specific
  if (!has_site_dim[2]) {
    theta <- outer(theta, rep(1, dim(z)[3]))
  }
  if (!has_site_dim[3]) {
    phi <- outer(phi, rep(1, dim(z)[3]))
  }

  # Calculate expected utility
  result <- rep(NA, nrow(settings))
  for (i in seq_len(nrow(settings))) {
    result[i] <- eutil(z = z, theta = theta, phi = phi,
                       K = settings[i, "K"], N = settings[i, "N"],
                       scale = "local",
                       N_rep = N_rep, cores = cores)
  }

  # Output
  out <- cbind(settings, result)
  colnames(out)[ncol(out)] <- "Utility"
  out
}

#' @title Expected utility for regional species diversity assessments.
#'
#' @description `eval_util_R()` evaluates the expected utility of a regional
#'  species diversity assessment using Monte Carlo integration.
#' @details
#' The utility of a regional species diversity assessment can be defined as
#'  the number of species expected to be detected in the region of interest
#'  (Fukaya et al. 2022). `eval_util_R()` evaluates this utility for the region
#'  modeled in the `occumbFit` object for the combination of `J`, `K`, and `N`
#'  values specified in the `conditions` argument.
#'  Such evaluations can be used to balance `J`, `K`, and `N` to maximize the
#'  utility under a constant budget (possible combinations of `J`, `K`, and `N`
#'  under a specified budget and cost values are easily obtained using
#'  `list_cond_R()`; see the example below).
#'  It is also possible to examine how the utility varies with different `J`,
#'  `K`, and `N` values without setting a budget level, which may be useful in determining
#'  the satisfactory levels of `J`, `K`, and `N` from a purely technical point of
#'  view.
#' 
#' The expected utility is defined as the expected value of the conditional
#'  utility in the form:
#'  \deqn{U(J, K, N \mid \boldsymbol{r}, \boldsymbol{u}) = \sum_{i = 1}^{I}\left\{1 - \prod_{j = 1}^{J}\prod_{k = 1}^{K}\left(1 - \frac{u_{ijk}r_{ijk}}{\sum_{m = 1}^{I}u_{mjk}r_{mjk}} \right)^N \right\}}{U(J, K, N | r, u) = sum_{i}((1 - \prod_{j}\prod_{k}(1 - (u[i, j, k] * r[i, j, k])/sum(u[, j, k] * r[, j, k])))^N)}
#'  where \eqn{u_{ijk}}{u[i, j, k]} is a latent indicator variable representing
#'  the inclusion of the sequence of species \eqn{i}{i} in replicate \eqn{k}{k}
#'  at site \eqn{j}{j}, and \eqn{r_{ijk}}{r[i, j, k]} is a latent variable that
#'  is proportional to the relative frequency of the sequence of species
#'  \eqn{i}{i}, conditional on its presence in replicate \eqn{k}{k} at site
#'  \eqn{j}{j} (Fukaya et al. 2022).
#' Expectations are taken with respect to the posterior (or possibly prior)
#'  predictive distributions of \eqn{\boldsymbol{r} = \{r_{ijk}\}}{r} and
#'  \eqn{\boldsymbol{u} = \{u_{ijk}\}}{u}, which are evaluated numerically using
#'  Monte Carlo integration. The predictive distributions of
#'  \eqn{\boldsymbol{r}}{r} and \eqn{\boldsymbol{u}}{u} depend on the model
#'  parameters \eqn{\psi}{psi}, \eqn{\theta}{theta}, and \eqn{\phi}{phi} values.
#'  Their posterior (or prior) distribution is specified by supplying an
#'  `occumbFit` object containing their posterior samples via the `fit` argument,
#'  or by supplying a matrix or array of posterior (or prior) samples of
#'  parameter values via the `psi`, `theta`, and `phi` arguments. Higher
#'  approximation accuracy can be obtained by increasing the value of `N_rep`.
#' 
#' The `eval_util_R()` function can be executed by supplying the `fit` argument
#'  without specifying the `psi`, `theta`, and `phi` arguments, by supplying the
#'  three `psi`, `theta`, and `phi` arguments without the `fit` argument, or by
#'  supplying the `fit` argument and any or all of the `psi`, `theta`, and `phi`
#'  arguments. If the `psi`, `theta`, or `phi` arguments are specified in addition
#'  to the `fit`, the parameter values given in these arguments are preferentially
#'  used to evaluate the expected utility. If the sample sizes differed among
#'  parameters, parameters with smaller sample sizes are resampled with
#'  replacements to align the sample sizes across parameters.
#'
#' The expected utility is evaluated assuming homogeneity of replicates, in the
#'  sense that \eqn{\theta}{theta} and \eqn{\phi}{phi}, the model parameters
#'  associated with the species detection process, are constant across
#'  replicates within a site. For this reason, `eval_util_R()` does not accept
#'  replicate-specific \eqn{\theta}{theta} and \eqn{\phi}{phi}. If the
#'  `occumbFit` object supplied in the `fit` argument has a replicate-specific
#'  parameter, the parameter samples to be used in the utility evaluation must be
#'  provided explicitly via the `theta` or `phi` arguments.
#'
#' If the parameters are modeled as a function of site covariates in the `fit`
#'  object, or if the `psi`, `theta`, and/or `phi` arguments have site dimensions,
#'  the expected utility is evaluated to account for the site heterogeneity of
#'  the parameters. To incorporate site heterogeneity, the
#'  parameter values for each `J` site are determined by selecting site-specific
#'  parameter values in the `fit`, or those supplied in `psi`, `theta`, and `phi`
#'  via random sampling with replacement. Thus, expected utility is
#'  evaluated by assuming a set of supplied parameter values as a statistical
#'  population of site-specific parameters.
#'
#' The Monte Carlo integration is executed in parallel on multiple CPU cores, where
#'  the `cores` argument controls the degree of parallelization.
#' @param settings A data frame that specifies a set of conditions under which
#'  utility is evaluated. It must include columns named `J`, `K`, and `N`,
#'  which specify the number of sites, number of replicates per site, and
#'  sequencing depth per replicate, respectively.
#'  `J`, `K`, and `N` must be numeric vectors greater than 0. When `J` and `K`
#'  contain decimal values, they are discarded and treated as integers.
#'  Additional columns are ignored, but may be included.
#' @param fit An `occumbFit` object.
#' @param psi Sample values of the site occupancy probabilities of species
#'  stored in a matrix with sample \eqn{\times}{*} species dimensions or an array
#'  with sample \eqn{\times}{*} species \eqn{\times}{*} site dimensions.
#' @param theta Sample values of sequence capture probabilities of species
#'  stored in a matrix with sample \eqn{\times}{*} species dimensions or an array
#'  with sample \eqn{\times}{*} species \eqn{\times}{*} site dimensions.
#' @param phi Sample values of sequence relative dominance of species stored in
#'  a matrix with sample \eqn{\times}{*} species dimensions or an array with
#'  sample \eqn{\times}{*} species \eqn{\times}{*} site dimensions.
#' @param N_rep Controls the sample size for the Monte Carlo integration.
#'  The integral is evaluated using a total of `N_sample * N_rep` random samples,
#'  where `N_sample` is the maximum size of the MCMC sample in the `fit`
#'  argument and the parameter sample in the `psi`, `theta`, and `phi` arguments.
#' @param cores The number of cores to use for parallelization.
#' @return A data frame with a column named `Utility` in which the estimates of
#'  the expected utility are stored. This is obtained by adding the `Utility` column
#'  to the data frame provided in the `settings` argument.
#' @section References:
#'      K. Fukaya, N. I. Kondo, S. S. Matsuzaki and T. Kadoya (2022)
#'      Multispecies site occupancy modelling and study design for spatially
#'      replicated environmental DNA metabarcoding. \emph{Methods in Ecology
#'      and Evolution} \strong{13}:183--193.
#'      \doi{10.1111/2041-210X.13732}
#' @examples
#' \donttest{
#' set.seed(1)
#'
#' # Generate a random dataset (20 species * 2 sites * 2 reps)
#' I <- 20 # Number of species
#' J <- 2  # Number of sites
#' K <- 2  # Number of replicates
#' data <- occumbData(
#'     y = array(sample.int(I * J * K), dim = c(I, J, K)))
#'
#' # Fitting a null model
#' fit <- occumb(data = data)
#'
#' ## Estimate expected utility
#' # Arbitrary J, K, and N values
#' (util1 <- eval_util_R(expand.grid(J = 1:3, K = 1:3, N = c(1E3, 1E4, 1E5)),
#'                       fit))
#'
#' # J, K, and N values under specified budget and cost
#' (util2 <- eval_util_R(list_cond_R(budget = 50000,
#'                                   lambda1 = 0.01,
#'                                   lambda2 = 5000,
#'                                   lambda3 = 5000),
#'                       fit))
#'
#' # K values restricted
#' (util3 <- eval_util_R(list_cond_R(budget = 50000,
#'                                   lambda1 = 0.01,
#'                                   lambda2 = 5000,
#'                                   lambda3 = 5000,
#'                                   K = 1:5),
#'                       fit))
#'
#' # J and K values restricted
#' (util4 <- eval_util_R(list_cond_R(budget = 50000,
#'                                   lambda1 = 0.01,
#'                                   lambda2 = 5000,
#'                                   lambda3 = 5000,
#'                                   J = 1:3, K = 1:5),
#'                       fit))
#'
#' # theta and phi values supplied
#' (util5 <- eval_util_R(list_cond_R(budget = 50000,
#'                                   lambda1 = 0.01,
#'                                   lambda2 = 5000,
#'                                   lambda3 = 5000,
#'                                   J = 1:3, K = 1:5),
#'                       fit,
#'                       theta = array(0.5, dim = c(4000, I, J)),
#'                       phi = array(1, dim = c(4000, I, J))))
#' 
#' # psi, theta, and phi values, but no fit object supplied
#' (util6 <- eval_util_R(list_cond_R(budget = 50000,
#'                                   lambda1 = 0.01,
#'                                   lambda2 = 5000,
#'                                   lambda3 = 5000,
#'                                   J = 1:3, K = 1:5),
#'                       fit = NULL,
#'                       psi = array(0.9, dim = c(4000, I, J)),
#'                       theta = array(0.9, dim = c(4000, I, J)),
#'                       phi = array(1, dim = c(4000, I, J))))
#' }
#' @export
eval_util_R <- function(settings,
                        fit = NULL,
                        psi = NULL,
                        theta = NULL,
                        phi = NULL,
                        N_rep = 1,
                        cores = 1L) {

  # Validate arguments
  check_args_eval_util_R(settings, fit, psi, theta, phi)

  # Set parameter values
  if (is.null(psi)) {
    psi <- get_post_samples(fit, "psi")
  }

  if (is.null(theta)) {
    theta <- get_post_samples(fit, "theta")
  }

  if (is.null(phi)) {
    phi <- get_post_samples(fit, "phi")
  }

  # Determine site dimension
  has_site_dim <- c(length(dim(psi))   == 3,
                    length(dim(theta)) == 3,
                    length(dim(phi))   == 3)
  N_site <-
    if (has_site_dim[1]) {
      dim(psi)[3]
    } else if (has_site_dim[2]) {
      dim(theta)[3]
    } else if (has_site_dim[3]) {
      dim(phi)[3]
    }

  # Resampling to match sample size
  n_samples <- c(dim(psi)[1], dim(theta)[1], dim(phi)[1])

  if (n_samples[1] < max(n_samples)) {
    psi <- resample_z_psi_theta_phi(psi, has_site_dim[1], max(n_samples))
  }
  if (n_samples[2] < max(n_samples)) {
    theta <- resample_z_psi_theta_phi(theta, has_site_dim[2], max(n_samples))
  }
  if (n_samples[3] < max(n_samples)) {
    phi <- resample_z_psi_theta_phi(phi, has_site_dim[3], max(n_samples))
  }

  # Make N_rep copies of psi, theta, and phi
  if (N_rep > 1) {
    psi   <- copy_psi_theta_phi(psi, has_site_dim[1], N_rep)
    theta <- copy_psi_theta_phi(theta, has_site_dim[2], N_rep)
    phi   <- copy_psi_theta_phi(phi, has_site_dim[3], N_rep)
  }

  # Calculate expected utility
  result <- rep(NA, nrow(settings))
  for (i in seq_len(nrow(settings))) {
    # Random sampling of sites
    J <- settings[i, "J"]
    site_use <- matrix(nrow = dim(psi)[1], ncol = J)
    if (any(has_site_dim)) {
      for (n in seq_len(nrow(site_use))) {
        site_use[n, ] <- sample.int(N_site, J, replace = TRUE)
      }
    }

    # Prepare z, theta, and phi (dim = N_sample * N_species * N_site)
    z_i     <- get_z_i(psi, has_site_dim[1], site_use)
    theta_i <- get_theta_phi_i(theta, has_site_dim[2], site_use)
    phi_i   <- get_theta_phi_i(phi, has_site_dim[3], site_use)

    result[i] <- eutil(z = z_i, theta = theta_i, phi = phi_i,
                       K = settings[i, "K"], N = settings[i, "N"],
                       scale = "regional",
                       N_rep = 1, cores = cores)
  }

  # Output
  out <- cbind(settings, result)
  colnames(out)[ncol(out)] <- "Utility"
  out
}

#' @title Conditions for local assessment under certain budget and cost values.
#' @description `list_cond_L()` constructs a list of possible local species
#'  diversity assessment conditions under the specified budget and cost values.
#' @details
#'   This function can generate a data frame object to be given to the
#'  `settings` argument of `eval_util_L()`; see Examples of `eval_util_L()`.
#'  By default, it outputs a list of all feasible combinations of values for the
#'  number of replicates per site `K` and the sequencing depth per replicate
#'  `N` based on the given budget, cost values, and number of sites
#'  (identified by reference to the `fit` object). The resulting `N` can be a
#'  non-integer because it is calculated simply by assuming that the maximum
#'  value can be obtained. To obtain a list for only a subset of the
#'  possible `K` values under a given budget and cost value, the `K`
#'  argument is used to provide a vector of the desired `K` values.
#' @param budget A numeric specifying budget amount. The currency unit
#'  is arbitrary but must be consistent with that of `lambda1` and `lambda2`.
#' @param lambda1 A numeric specifying the cost per sequence read for
#'  high-throughput sequencing. The currency unit is arbitrary but must be
#'  consistent with that of `budget` and `lambda2`.
#' @param lambda2 A numeric specifying the cost per replicate for library
#'  preparation. The currency unit is arbitrary but must be consistent with that
#'  of `budget` and `lambda1`.
#' @param fit An `occumbFit` object.
#' @param K An optional vector for manually specifying the number of replicates.
#' @return A data frame containing columns named `budget`, `lambda1`, `lambda2`,
#'  `K`, and `N`.
#' @export
list_cond_L <- function(budget, lambda1, lambda2, fit, K = NULL) {

  ## Validate arguments
  # Assert that budget and cost values are positive.
  if (!checkmate::test_numeric(budget, lower = 0)) {
    stop("Negative 'budget' value.\n")
  }
  if (!checkmate::test_numeric(lambda1, lower = 0)) {
    stop("Negative 'lambda1' value.\n")
  }
  if (!checkmate::test_numeric(lambda2, lower = 0)) {
    stop("Negative 'lambda2' value.\n")
  }

  # Assert that fit is an occumbFit object
  assert_occumbFit(fit)

  # Determine the number of sites
  J <- dim(get_post_samples(fit, "z"))[3]

  # Determine max_K under given budget, cost, and the number of sites
  max_K <- floor(budget / (lambda2 * J))

  # Assert that given settings ensure at least one replicate per site
  if (!checkmate::test_numeric(max_K, lower = 1)) {
    stop("Impossible to have > 0 replicates per site under the given budget, cost, and the number of sites.\n")
  }

  if (is.null(K)) {
    # Determine K values
    K <- seq_len(max_K)[budget - lambda2 * J * seq_len(max_K) > 0]
  } else {
    # Assert that K >= 1
    if (!checkmate::test_numeric(K, lower = 1)) {
      stop("'K' contains values less than one.\n")
    }

    # Assert that the all given K are feasible
    if (any(!budget - lambda2 * J * K > 0)) {
      stop(paste("A value of 'K' greater than",
                 max_K,
                 "is not feasible under the given budget, cost, and the number of sites.\n"))
    }
  }

  ## Output a table of conditions
  out <- cbind(rep(budget, length(K)),
               rep(lambda1, length(K)),
               rep(lambda2, length(K)),
               K,
               (budget - lambda2 * J * K) / (lambda1 * J * K))
  colnames(out) <- c("budget", "lambda1", "lambda2", "K", "N")
  data.frame(out)
}

#' @title Conditions for regional assessment under certain budget and cost values.
#' @description `list_cond_R()` constructs a list of possible regional species
#'  diversity assessment conditions under the specified budget and cost values.
#' @details
#'   This function can generate a data frame object to be given to the
#'  `settings` argument of `eval_util_R()`; see Examples of `eval_util_R()`.
#'  By default, it outputs a list of all feasible combinations of values for the
#'  number of sites `J`, number of replicates per site `K`, and sequencing
#'  depth per replicate `N` based on the given budget and cost values.
#'  The resulting `N` can be a non-integer because it is calculated simply by
#'  assuming that the maximum value can be obtained.
#'  If one wants to obtain a list for only a subset of the possible values of `J`
#'  and `K` under a given budget and cost value, use the `J` and/or `K`
#'  arguments (in fact, it is recommended that a relatively small number of `K`
#'  values be specified using the `K` argument because the list of all
#'  conditions achievable under moderate budget and cost values can be large, and
#'  it is rarely practical to have a vast number of replicates per site). If a
#'  given combination of `J` and `K` values is not feasible under the specified
#'  budget and cost values, the combination will be ignored and excluded from
#'  the output.
#' @param budget A numeric specifying budget amount. The currency unit is
#'  arbitrary but must be consistent with that of `lambda1`, `lambda2`, and `lambda3`.
#' @param lambda1 A numeric specifying the cost per sequence read for
#'  high-throughput sequencing. The currency unit is arbitrary but must be
#'  consistent with that of `budget`, `lambda2`, and `lambda3`.
#' @param lambda2 A numeric specifying the cost per replicate for library
#'  preparation. The currency unit is arbitrary but must be consistent with that
#'  of `budget`, `lambda1`, and `lambda3`.
#' @param lambda3 A numeric specifying the visiting cost per site. The currency
#'  unit is arbitrary but must be consistent with that of `budget`, `lambda1`,
#'  and `lambda2`.
#' @param J An optional vector for manually specifying the number of sites
#' @param K An optional vector used to specify the number of replicates manually.
#'  For computational convenience, the `K` values must be in ascending order.
#' @return A data frame containing columns named `budget`, `lambda1`, `lambda2`,
#'  `lambda3`, `J`, `K`, and `N`.
#' @export
list_cond_R <- function(budget, lambda1, lambda2, lambda3, J = NULL, K = NULL) {

  ## Validate arguments
  # Assert that budget and cost values are positive.
  if (!checkmate::test_numeric(budget, lower = 0)) {
    stop("Negative 'budget' value.\n")
  }
  if (!checkmate::test_numeric(lambda1, lower = 0)) {
    stop("Negative 'lambda1' value.\n")
  }
  if (!checkmate::test_numeric(lambda2, lower = 0)) {
    stop("Negative 'lambda2' value.\n")
  }
  if (!checkmate::test_numeric(lambda3, lower = 0)) {
    stop("Negative 'lambda3' value.\n")
  }

  # Assert that J, K >= 1
  if (!is.null(J) && !checkmate::test_numeric(J, lower = 1)) {
    stop("'J' contains values less than one.\n")
  }
  if (!is.null(K) && !checkmate::test_numeric(K, lower = 1)) {
    stop("'K' contains values less than one.\n")
  }

  # Assert that K is in ascending order
  if (!is.null(K) && !identical(K, sort(K))) {
    stop("'K' must be in ascending order.\n")
  }

  # Determine the combination of J and K to be used
  if (is.null(J)) {
    max_J <- find_maxJ(budget, lambda2, lambda3)
    if (!max_J) {
      stop("No valid combination of 'J' and 'K' under the given budget and cost.\n")
    }
    J <- seq_len(max_J)
  }
  if (is.null(K)) {
    max_K <- find_maxK(budget, lambda2, lambda3)
    if (!max_K) {
      stop("No valid combination of 'J' and 'K' under the given budget and cost.\n")
    }
    K <- seq_len(max_K)
  }

  J_valid <- K_valid <- vector()
  for (k in K) {
    J_valid_k <- J[budget - lambda2 * J * k - lambda3 * J > 0]
    if (length(J_valid_k) > 0) {
      J_valid <- c(J_valid, J_valid_k)
      K_valid <- c(K_valid, rep(k, length(J_valid_k)))
    } else {
      break
    }
  }

  # Assert that given settings ensure at least one valid combination of J and K
  if (!length(J_valid) > 0) {
    stop("No valid combination of 'J' and 'K' under the given budget and cost.\n")
  }

  ## Output a table of conditions
  out <- cbind(rep(budget, length(J_valid)),
               rep(lambda1, length(J_valid)),
               rep(lambda2, length(J_valid)),
               rep(lambda3, length(J_valid)),
               J_valid,
               K_valid,
               (budget - lambda2 * J_valid * K_valid - lambda3 * J_valid) / (lambda1 * J_valid * K_valid))
  colnames(out) <- c("budget", "lambda1", "lambda2", "lambda3", "J", "K", "N")
  data.frame(out)
}

check_args_eval_util_L <- function(settings, fit, z, theta, phi) {
  # Assert that settings is a data frame and contains the required columns
  checkmate::assert_data_frame(settings)
  if (!checkmate::testSubset("K", names(settings))) {
    stop("The 'settings' argument does not contain column 'K'.\n")
  }
  if (!checkmate::testSubset("N", names(settings))) {
    stop("The 'settings' argument does not contain column 'N'.\n")
  }
  if (!checkmate::test_numeric(settings[, "K"], lower = 1)) {
    stop("'K' contains values less than one.\n")
  }
  if (!checkmate::test_numeric(settings[, "N"], lower = 1)) {
    stop("'N' contains values less than one.\n")
  }

  # Assert that either fit or (z, theta, phi) is provided
  if (is.null(fit) && !(!is.null(z) && !is.null(theta) && !is.null(phi))) {
    stop("Parameter values are not fully specified: use fit argument or otherwise use all of z, theta, phi arguments.\n")
  }

  if (!is.null(fit)) {
    # Assert that fit is an occumbFit object
    assert_occumbFit(fit)

    # Stop when modeled parameters are replicate-specific
    if (length(dim(get_post_samples(fit, "theta"))) == 4 && is.null(theta)) {
      stop("'fit' contains replicate-specific theta: specify appropriate theta values via the 'theta' argument to run.\n")
    }
    if (length(dim(get_post_samples(fit, "phi"))) == 4 && is.null(phi)) {
      stop("'fit' contains replicate-specific phi: specify appropriate phi values via the 'phi' argument to run.\n")
    }
  }

  # Assert that z, theta, and phi have an appropriate dimension and elements
  checkmate::assert_array(z, d = 3, null.ok = TRUE)
  checkmate::assert_array(theta, min.d = 2, max.d = 3, null.ok = TRUE)
  checkmate::assert_array(phi, min.d = 2, max.d = 3, null.ok = TRUE)
  checkmate::assert_integerish(z, lower = 0, upper = 1,
                               any.missing = FALSE, null.ok = TRUE)
  checkmate::assert_numeric(theta, lower = 0, upper = 1,
                            any.missing = FALSE, null.ok = TRUE)
  checkmate::assert_numeric(phi, lower = 0, any.missing = FALSE, null.ok = TRUE)

  # Assert that z does not have sites occupied by no species
  if (!is.null(z)) {
    z_allzero <- apply(z, c(1, 3), function(x) sum(x) == 0)
    if (any(z_allzero)) {
      stop(sprintf("The given 'z' array contains case(s) where no species occupy a site; see, for example, that z[%s, , %s] is a zero vector",
                   which(z_allzero, arr.ind = TRUE)[1, 1],
                   which(z_allzero, arr.ind = TRUE)[1, 2]))
    }
  }

  # Assert equality in species/site dimensions of z, theta, phi, and fit
  if (!is.null(fit)) {
    I <- dim(fit@data@y)[1]
    J <- dim(fit@data@y)[2]

    if (!is.null(z)) {
      if (dim(z)[2] != I) {
        stop(paste0("Mismatch in species dimension: dim(z)[2] must be ", I, ".\n"))
      }
      if (dim(z)[3] != J) {
        stop(paste0("Mismatch in site dimension: dim(z)[3] must be ", J, ".\n"))
      }
    }
    if (!is.null(theta)) {
      if (dim(theta)[2] != I) {
        stop(paste0("Mismatch in species dimension: dim(theta)[2] must be ", I, ".\n"))
      }
      if (length(dim(theta)) == 3 && dim(theta)[3] != J) {
        stop(paste0("Mismatch in site dimension: dim(theta)[3] must be ", J, ".\n"))
      }
    }
    if (!is.null(phi)) {
      if (dim(phi)[2] != I) {
        stop(paste0("Mismatch in species dimension: dim(phi)[2] must be ", I, ".\n"))
      }
      if (length(dim(phi)) == 3 && dim(phi)[3] != J) {
        stop(paste0("Mismatch in site dimension: dim(phi)[3] must be ", J, ".\n"))
      }
    }
  } else {
    has_site_dim <- c(TRUE, length(dim(theta)) == 3, length(dim(phi)) == 3)
    vI <- c(dim(z)[2], dim(theta)[2], dim(phi)[2])
    vJ <- c(dim(z)[3],
            ifelse(has_site_dim[2], dim(theta)[3], NA),
            ifelse(has_site_dim[3], dim(phi)[3], NA))
    vJ <- vJ[has_site_dim]
    terms <- c("dim(z)[3]", "dim(theta)[3]", "dim(phi)[3]")[has_site_dim]

    if (length(unique(vI)) != 1) {
      stop("Mismatch in species dimension: dim(z)[2], dim(theta)[2], and dim(phi)[2] must be equal.\n")
    }
    if (length(unique(vJ)) != 1) {
      stop(paste0("Mismatch in site dimension: ",
                  knitr::combine_words(terms), " must be equal.\n"))
    }
  }

  invisible(NULL)
}

check_args_eval_util_R <- function(settings, fit, psi, theta, phi) {
  # Assert that settings is a data frame and contains the required columns
  checkmate::assert_data_frame(settings)
  if (!checkmate::testSubset("J", names(settings))) {
    stop("The 'settings' argument does not contain column 'J'.\n")
  }
  if (!checkmate::testSubset("K", names(settings))) {
    stop("The 'settings' argument does not contain column 'K'.\n")
  }
  if (!checkmate::testSubset("N", names(settings))) {
    stop("The 'settings' argument does not contain column 'N'.\n")
  }
  if (!checkmate::test_numeric(settings[, "J"], lower = 1)) {
    stop("'J' contains values less than one.\n")
  }
  if (!checkmate::test_numeric(settings[, "K"], lower = 1)) {
    stop("'K' contains values less than one.\n")
  }
  if (!checkmate::test_numeric(settings[, "N"], lower = 1)) {
    stop("'N' contains values less than one.\n")
  }

  # Assert that either fit or (psi, theta, phi) is provided
  if (is.null(fit) && !(!is.null(psi) && !is.null(theta) && !is.null(phi))) {
    stop("Parameter values are not fully specified: use fit argument or otherwise use all of psi, theta, phi arguments.\n")
  }

  if (!is.null(fit)) {
    # Assert that fit is an occumbFit object
    assert_occumbFit(fit)

    # Stop when modeled parameters are replicate-specific
    if (length(dim(get_post_samples(fit, "theta"))) == 4 && is.null(theta)) {
      stop("'fit' contains replicate-specific theta: specify appropriate theta values via the 'theta' argument to run.\n")
    }
    if (length(dim(get_post_samples(fit, "phi"))) == 4 && is.null(phi)) {
      stop("'fit' contains replicate-specific phi: specify appropriate phi values via the 'phi' argument to run.\n")
    }
  }

  # Assert that psi, theta, and phi have an appropriate dimension and elements
  checkmate::assert_array(psi, min.d = 2, max.d = 3, null.ok = TRUE)
  checkmate::assert_array(theta, min.d = 2, max.d = 3, null.ok = TRUE)
  checkmate::assert_array(phi, min.d = 2, max.d = 3, null.ok = TRUE)
  checkmate::assert_numeric(psi, lower = 0, upper = 1,
                            any.missing = FALSE, null.ok = TRUE)
  checkmate::assert_numeric(theta, lower = 0, upper = 1,
                            any.missing = FALSE, null.ok = TRUE)
  checkmate::assert_numeric(phi, lower = 0, any.missing = FALSE, null.ok = TRUE)

  # Assert equality in species/site dimensions of psi, theta, phi, and fit
  if (!is.null(fit)) {
    I <- dim(fit@data@y)[1]
    J <- dim(fit@data@y)[2]

    if (!is.null(psi)) {
      if (dim(psi)[2] != I) {
        stop(paste0("Mismatch in species dimension: dim(psi)[2] must be ", I, ".\n"))
      }
      if (length(dim(psi)) == 3 && dim(psi)[3] != J) {
        stop(paste0("Mismatch in site dimension: dim(psi)[3] must be ", J, ".\n"))
      }
    }
    if (!is.null(theta)) {
      if (dim(theta)[2] != I) {
        stop(paste0("Mismatch in species dimension: dim(theta)[2] must be ", I, ".\n"))
      }
      if (length(dim(theta)) == 3 && dim(theta)[3] != J) {
        stop(paste0("Mismatch in site dimension: dim(theta)[3] must be ", J, ".\n"))
      }
    }
    if (!is.null(phi)) {
      if (dim(phi)[2] != I) {
        stop(paste0("Mismatch in species dimension: dim(phi)[2] must be ", I, ".\n"))
      }
      if (length(dim(phi)) == 3 && dim(phi)[3] != J) {
        stop(paste0("Mismatch in site dimension: dim(phi)[3] must be ", J, ".\n"))
      }
    }
  } else {
    has_site_dim <- c(length(dim(psi)) == 3,
                      length(dim(theta)) == 3,
                      length(dim(phi)) == 3)
    vI <- c(dim(psi)[2], dim(theta)[2], dim(phi)[2])
    vJ <- c(dim(psi)[3],
            ifelse(has_site_dim[2], dim(theta)[3], NA),
            ifelse(has_site_dim[3], dim(phi)[3], NA))
    vJ <- vJ[has_site_dim]
    terms <- c("dim(psi)[3]", "dim(theta)[3]", "dim(phi)[3]")[has_site_dim]

    if (length(unique(vI)) > 1) {
      stop("Mismatch in species dimension: dim(psi)[2], dim(theta)[2], and dim(phi)[2] must be equal.\n")
    }
    if (length(unique(vJ)) > 1) {
      stop(paste0("Mismatch in site dimension: ",
                  knitr::combine_words(terms), " must be equal.\n"))
    }
  }

  invisible(NULL)
}

# @title Monte-Carlo integration to obtain expected utility.
# @param z A species presence-absence array
#   (dim = N_sample * N_species * N_sites).
# @param theta A sequence capture probability array
#   (dim = N_sample * N_species * N_sites).
# @param phi A relative sequence dominance array
#   (dim = N_sample, N_species * N_sites).
# @param K The number of replicates (integer).
# @param N Sequence depth (numeric).
# @param scale Spatial scale to evaluate detection effectiveness.
# @param N_rep Controls the sample size for Monte Carlo simulation.
#   The integral is evaluated using a total of N_sample * N_rep random samples.
# @param core The number of cores to use for parallelization.
# @return The expected utility.
eutil <- function(z, theta, phi, K, N, scale = c("local", "regional"),
                  N_rep = N_rep, cores = cores) {

  M <- dim(z)[1]

  if (match.arg(scale) == "local") {
    fun <- .cutil_local
  } else if (match.arg(scale) == "regional") {
    fun <- .cutil_regional
  }

  if (cores == 1) {
    util_rep <- unlist(
      lapply(X = rep(seq_len(M), each = N_rep),
             FUN = fun,
             args = list(z = z, theta = theta, phi = phi, K = K, N = N))
    )
  } else {
    if (.Platform$OS.type == "windows") {
      # On Windows use makePSOCKcluster() and parLapply() for multiple cores
      cl <- parallel::makePSOCKcluster(cores)
      parallel::clusterEvalQ(cl, library(occumb))
      on.exit(parallel::stopCluster(cl))
      util_rep <- unlist(
        parallel::parLapply(cl = cl,
                            X = rep(seq_len(M), each = N_rep),
                            fun = fun,
                            args = list(z = z,
                                        theta = theta,
                                        phi = phi,
                                        K = K,
                                        N = N))
      )
    } else {
      # On Mac or Linux use mclapply() for multiple cores
      result <-
        parallel::mclapply(mc.cores = cores,
                           X = rep(seq_len(M), each = N_rep),
                           FUN = fun,
                           args = list(z = z,
                                       theta = theta,
                                       phi = phi,
                                       K = K,
                                       N = N))
      is_error <- sapply(result, inherits, "try-error")
      if (any(is_error)) {
        stop(result[is_error][1])
      } else {
        util_rep <- unlist(result)
      }
    }
  }

  if (any(is.nan(util_rep))) {
    warning("Case(s) arose in the replicated simulation where 'Utility' could not be calculated and were ignored. This result may sometimes occur stochastically; try repeat running to see if the same warning occurs. If the same result occurs frequently, the given 'theta' or 'phi' values might need to be higher.")
  }
  mean(util_rep, na.rm = TRUE)
}

# @title Conditional utility function for local species diversity assessments.
# @param z A species presence-absence matrix (dim = N_species * N_sites).
# @param theta A sequence capture probability matrix (dim = N_species * N_sites).
# @param phi A relative sequence dominance matrix (dim = N_species * N_sites).
# @param K The number of replicates (integer).
# @param N Sequence depth (numeric).
# @return The expected number of detected species per site.
cutil_local <- function(z, theta, phi, K, N) {
  pi <- predict_pi(z, theta, phi, K)
  detect_probs <- predict_detect_probs_local(pi, N)
  return(sum(detect_probs) / ncol(z))
}

# @title Conditional utility function for regional species diversity assessments.
# @param z A species presence-absence matrix (dim = N_species * N_sites).
# @param theta A sequence capture probability matrix (dim = N_species * N_sites).
# @param phi A relative sequence dominance matrix (dim = N_species * N_sites).
# @param K The number of replicates (integer).
# @param N Sequence depth (numeric).
# @return The expected total number of species detected over the all sites.
cutil_regional <- function(z, theta, phi, K, N) {
  pi <- predict_pi(z, theta, phi, K)
  detect_probs <- predict_detect_probs_regional(pi, N)
  return(sum(detect_probs))
}

# Auxiliary functions adapting cutil to lapply
.cutil_local <- function(n, args) {
  if (dim(args$z)[2] == 1) {          # When I = 1
    cutil_local(matrix(args$z[n, , ], nrow = 1),
                matrix(args$theta[n, , ], nrow = 1),
                matrix(args$phi[n, , ], nrow = 1),
                args$K,
                args$N)
  } else if (dim(args$z)[3] == 1) {   # When J = 1
    cutil_local(matrix(args$z[n, , ], ncol = 1),
                matrix(args$theta[n, , ], ncol = 1),
                matrix(args$phi[n, , ], ncol = 1),
                args$K,
                args$N)
  } else {
    cutil_local(args$z[n, , ],
                args$theta[n, , ],
                args$phi[n, , ],
                args$K,
                args$N)
  }
}

.cutil_regional <- function(n, args) {
  if (dim(args$z)[2] == 1) {          # When I = 1
    cutil_regional(matrix(args$z[n, , ], nrow = 1),
                   matrix(args$theta[n, , ], nrow = 1),
                   matrix(args$phi[n, , ], nrow = 1),
                   args$K,
                   args$N)
  } else if (dim(args$z)[3] == 1) {   # When J = 1
    cutil_regional(matrix(args$z[n, , ], ncol = 1),
                   matrix(args$theta[n, , ], ncol = 1),
                   matrix(args$phi[n, , ], ncol = 1),
                   args$K,
                   args$N)
  } else {
    cutil_regional(args$z[n, , ],
                   args$theta[n, , ],
                   args$phi[n, , ],
                   args$K,
                   args$N)
  }
}

# Calculate pi
predict_pi <- function(z, theta, phi, K) {
  I <- nrow(z)
  J <- ncol(z)

  u <- r <- array(dim = c(I, J, K))
  pi <- array(dim = c(I, J, K))

  # Posterior predictive samples of u and r
  for (k in seq_len(K)) {
    u[, , k] <- sample_u(z * theta)
    r[, , k] <- stats::rgamma(I * J, phi, 1)
  }

  # Derive pi
  for (j in seq_len(J)) {
    for (k in seq_len(K)) {
      ur <- (u * r)[, j, k]
      pi[, j, k] <- ur / sum(ur)
    }
  }

  pi
}

# Calculate species detection probabilities (local scale)
predict_detect_probs_local <- function(pi, N) {
  I <- dim(pi)[1]
  J <- dim(pi)[2]

  detect_probs <- matrix(nrow = I, ncol = J)
  for (i in seq_len(I)) {
    for (j in seq_len(J)) {
      detect_probs[i, j] <- 1 - prod((1 - pi[i, j, ])^N)
    }
  }

  detect_probs
}

# Calculate species detection probabilities (regional scale)
predict_detect_probs_regional <- function(pi, N) {
  I <- dim(pi)[1]

  detect_probs <- vector(length = I)
  for (i in seq_len(I)) {
    detect_probs[i] <- 1 - prod((1 - pi[i, , ])^N)
  }

  detect_probs
}

# Find the maximum K value under the specified budget for regional assessment
find_maxK <- function(budget, lambda2, lambda3, ulim = 1E4) {
  J <- 1
  for (n in 2:log10(ulim)) {
    K <- seq_len(10^n)

    if (any(budget - lambda2 * J * K - lambda3 * J > 0)) {
      maxK <- max(K[budget - lambda2 * J * K - lambda3 * J > 0])
    } else {
      maxK <- 0
    }

    if (maxK < length(K)) {
      break
    }

    if (n == log10(ulim)) {
      stop("Maximum `K` value seems too large under the specified budget and cost values: consider using the `K` argument to specify a smaller set of `K` values of interest.\n")
    }
  }
  maxK
}

# Find the maximum J value under the specified budget for regional assessment
find_maxJ <- function(budget, lambda2, lambda3, ulim = 1E6) {
  K <- 1
  for (n in 4:log10(ulim)) {
    J <- seq_len(10^n)

    if (any(budget - lambda2 * J * K - lambda3 * J > 0)) {
      maxJ <- max(J[budget - lambda2 * J * K - lambda3 * J > 0])
    } else {
      maxJ <- 0
    }

    if (maxJ < length(J)) {
      break
    }

    if (n == log10(ulim)) {
      stop("Maximum `J` value seems too large under the specified budget and cost values: consider using the `J` argument to specify a smaller set of `J` values of interest.\n")
    }
  }
  maxJ
}

# Sample z except for all zero cases
sample_z <- function(psi) {
  z <- stats::rbinom(length(psi), 1, psi)

  # Resample when all species have z = 0
  if (sum(z)) {
    return(z)
  } else {
    for (i in seq_len(10000)) {
      z <- stats::rbinom(length(psi), 1, psi)
      if (sum(z)) return(z)
    }
    stop("Failed to generate valid 'z' values under the given parameter set. Providing 'psi' containing higher psi values may fix the issue.")
  }
}

# Sample u except for all zero cases
sample_u <- function(z_theta) {
  u <- array(stats::rbinom(nrow(z_theta) * ncol(z_theta), 1, z_theta),
             dim = dim(z_theta))

  # Find cases where all species have u = 0 to resample
  allzero <- which(colSums(u) == 0)
  if (length(allzero)) {
    for (n in seq_along(allzero)) {
      for (i in seq_len(10000)) {
        u[, allzero[n]] <- stats::rbinom(nrow(z_theta), 1, z_theta[, allzero[n]])
        if (sum(u[, allzero[n]]) == 0) break
      }
    }
    if (any(colSums(u) == 0)) {
      stop("Failed to generate valid 'u' values under the given parameter set. Providing 'theta' or 'psi' containing higher probability values may fix the issue.")
    } else {
      return(u)
    }
  } else {
    return(u)
  }
}

get_z_i <- function(psi, has_site_dim, site_use) {
  out <- array(NA, dim = c(dim(psi)[1], dim(psi)[2], ncol(site_use)))

  if (has_site_dim) {
    for (j in seq_len(dim(out)[3])) {
      for (n in seq_len(dim(out)[1])) {
        out[n, , j] <- sample_z(psi[n, , site_use[n, j]])
      }
    }
  } else {
    for (j in seq_len(dim(out)[3])) {
      for (n in seq_len(dim(out)[1])) {
        out[n, , j] <- sample_z(psi[n, ])
      }
    }
  }

  out
}

get_theta_phi_i <- function(param, has_site_dim, site_use) {
  if (has_site_dim) {
    out <- array(NA, dim = c(dim(param)[1], dim(param)[2], ncol(site_use)))

    for (j in seq_len(dim(out)[3])) {
      for (n in seq_len(dim(out)[1])) {
        out[n, , j] <- param[n, , site_use[n, j]]
      }
    }
  } else {
    out <- outer(param, rep(1, ncol(site_use)))
  }

  out
}

resample_z_psi_theta_phi <- function(param, has_site_dim, n_sample) {
  if (has_site_dim) {
    return(param[sample.int(dim(param)[1], n_sample, replace = TRUE), , ])
  } else {
    return(param[sample.int(dim(param)[1], n_sample, replace = TRUE), ])
  }
}

copy_psi_theta_phi <- function(param, has_site_dim, N_rep) {
  if (has_site_dim) {
    out <- param[rep(seq_len(dim(param)[1]), N_rep), , ]
  } else {
    out <- param[rep(seq_len(dim(param)[1]), N_rep), ]
  }
  out
}

Try the occumb package in your browser

Any scripts or data that you put into this service are public.

occumb documentation built on June 8, 2025, 12:01 p.m.