R/input-data.R
In RoBMA: Robust Bayesian Meta-Analyses

Documented in print.RoBMA_data

#' @title Input Data Specification
#' @name data_input
#' @description
#' Shared data-input arguments used by the RoBMA fitting functions.
#'
#' Normal models use approximate effect-size estimates supplied through
#' `yi` with either `vi` or `sei`. GLMM models use the raw two-arm count
#' arguments for binomial (`measure = "OR"`) or Poisson (`measure = "IRR"`)
#' outcomes.
#'
#' @param yi a vector of effect sizes, or a formula with the effect size on the
#' left-hand side and location moderators on the right-hand side (for example
#' `yi ~ x1 + x2`). If a formula is supplied, `mods` must not be specified.
#' @param vi a vector of sampling variances. Either `vi` or `sei` must be
#' supplied for normal models.
#' @param sei a vector of standard errors. Either `vi` or `sei` must be
#' supplied for normal models.
#' @param weights an optional vector of positive likelihood weights. For
#' normal/effect-size models, each weight powers the estimate likelihood. For
#' constructors with GLMM raw-count input, each weight powers the paired
#' two-arm likelihood for one study.
#' @param ni an optional vector of sample sizes. Used for `measure = "GEN"`
#' or when estimating `"UISD"`).
#' @param mods an optional matrix, data.frame, or formula specifying
#' location moderators (meta-regressors). Formula input is evaluated in `data`.
#' @param scale an optional matrix, data.frame, or formula specifying
#' scale predictors for location-scale models. Formula input is evaluated in
#' `data`.
#' @param cluster an optional vector of cluster identifiers for multilevel
#' meta-analysis.
#' @param data an optional data frame containing the variables.
#' @param slab an optional vector of study labels.
#' @param subset an optional logical or numeric vector specifying a subset of
#' data to be used.
#' @param measure a character string specifying the effect size measure.
#' Normal/effect-size constructors require an explicit value and support
#' `"SMD"`, `"ZCOR"`, `"RR"`, `"OR"`, `"HR"`, `"RD"`, `"IRR"`, and `"GEN"`.
#' Use `"GEN"` only for general effect sizes without a known unit information
#' standard deviation. GLMM raw-count constructors support only `"OR"` and
#' `"IRR"` and default to `"OR"`.
#' @param effect_direction direction used by publication-bias adjustments.
#' `"positive"` assumes statistically significant positive estimates are more
#' likely to be selected; `"negative"` mirrors the selection direction;
#' `"detect"` infers the direction from the fitted data.
#' @param ai a vector of the number of events in the treatment or experimental
#' group for binomial GLMM models.
#' @param bi a vector of the number of non-events in the treatment or
#' experimental group for binomial GLMM models.
#' @param ci a vector of the number of events in the control group for binomial
#' GLMM models.
#' @param di a vector of the number of non-events in the control group for
#' binomial GLMM models.
#' @param n1i a vector of the sample size in the treatment or experimental
#' group. If omitted for binomial GLMMs, it is computed as `ai + bi`.
#' @param n2i a vector of the sample size in the control group. If omitted for
#' binomial GLMMs, it is computed as `ci + di`.
#' @param x1i a vector of the number of events in the treatment/experimental group
#' (for Poisson data).
#' @param x2i a vector of the number of events in the control group (for Poisson data).
#' @param t1i a vector of the person-time in the treatment/experimental group.
#' @param t2i a vector of the person-time in the control group.
#'
NULL



# Internal helper function to extract a variable from a data frame or environment
# Supports three input formats:
#   1. Direct vector: yi = c(0.1, 0.2, 0.3)
#   2. Variable name (unquoted): yi = effect_size (where effect_size is a column in data)
#   3. String name (quoted): yi = "effect_size" (where "effect_size" is a column name in data)
#
# @param mf The matched call from the parent function
# @param data The data frame to search in (can be NULL)
# @param enclos The enclosing environment for evaluation
# @param name The name of the argument (for error messages)
# @param allow_NULL Logical; if TRUE, NULL values are allowed
# @return The extracted vector or NULL
.get_variable <- function(mf, data, enclos, name, allow_NULL = TRUE) {

  # Check if argument is in the call
  arg_index <- match(name, names(mf))

  # If argument was not specified in the call, return NULL
  if (is.na(arg_index)) {
    return(NULL)
  }

  # Get the unevaluated expression from the matched call
  mf_x <- mf[[arg_index]]

  # If the argument value itself is NULL, return NULL
  if (is.null(mf_x)) {
    return(NULL)
  }

  # First, try to evaluate the expression directly in the data frame (if provided)
  if (!is.null(data) && is.data.frame(data)) {
    out <- try(eval(mf_x, data, enclos), silent = TRUE)
  } else {
    out <- try(eval(mf_x, enclos), silent = TRUE)
  }

  # If evaluation succeeded and result is a valid vector, return it
  if (!inherits(out, "try-error") && !is.function(out)) {

    # If result is a single string matching a column name in data, extract that column
    if (is.character(out) && length(out) == 1 && !is.null(data) && is.data.frame(data) && out %in% names(data)) {
      out <- data[[out]]
    }

    # Strip attributes (e.g., from metafor::escalc output) from atomic vectors only
    # Do not strip from formulas, data.frames, lists, or other complex objects
    if (is.atomic(out) && !inherits(out, "formula")) {
      out <- as.vector(out)
    }

    return(out)
  }

  # If direct evaluation failed, check if it's a string referring to a column
  if (is.character(mf_x) && length(mf_x) == 1 && !is.null(data) && is.data.frame(data) && mf_x %in% names(data)) {
    return(as.vector(data[[mf_x]]))
  }

  # If still not found, report error
  if (!allow_NULL) {
    stop(paste0("Cannot find the object/variable ('", deparse(mf_x),
                "') specified for the '", name, "' argument."), call. = FALSE)
  }

  return(NULL)
}


# Internal function to extract common optional variables shared by all outcome handlers
# Handles weights, cluster, and slab extraction and validation
#
# @param .call Matched call from the calling function
# @param data The data frame (can be NULL)
# @param .envir The enclosing environment
# @param k The expected number of observations
# @param primary_var The name of the primary variable (for error messages, e.g., "yi" or "ai")
#
# @return A list with:
#   - weights: numeric vector or NULL
#   - cluster: vector or NULL
#   - slab: character vector or NULL
#   - slab_provided: logical
#   - cluster_provided: logical
.check_and_list_data.optional_vars <- function(.call, data, .envir, k, primary_var) {

  # Extract optional variables
  weights   <- .get_variable(.call, data, .envir, "weights",   allow_NULL = TRUE)
  cluster   <- .get_variable(.call, data, .envir, "cluster",   allow_NULL = TRUE)
  slab      <- .get_variable(.call, data, .envir, "slab",      allow_NULL = TRUE)

  # Track which optional fields were provided
  weights_provided   <- !is.null(weights)
  slab_provided      <- !is.null(slab)
  cluster_provided   <- !is.null(cluster)

  # Validate weights
  if (!is.null(weights) && anyNA(weights))
    stop("The 'weights' argument must not contain missing values.", call. = FALSE)
  if (!is.null(weights))
    BayesTools::check_real(weights, "weights", check_length = k, allow_NULL = TRUE, allow_NA = FALSE, lower = 0, allow_bound = FALSE)

  # Validate cluster
  if (!is.null(cluster) && length(cluster) != k)
    stop(paste0("The 'cluster' argument must have length ", k, " (same as '", primary_var, "')."), call. = FALSE)
  if (!is.null(cluster) && anyNA(cluster))
    stop("The 'cluster' argument must not contain missing values.", call. = FALSE)

  # Validate slab (study labels)
  if (!is.null(slab) && length(slab) != k)
    stop(paste0("The 'slab' argument must have length ", k, " (same as '", primary_var, "')."), call. = FALSE)

  return(list(
    weights            = weights,
    cluster            = cluster,
    slab               = slab,
    weights_provided   = weights_provided,
    slab_provided      = slab_provided,
    cluster_provided   = cluster_provided
  ))
}


# Internal function to check and list input data for RoBMA
# This is the main entry point for processing input data. It dispatches to
# the appropriate outcome handler based on the class argument.
#
# @param .call Matched call from the calling function (required for NSE)
# @param .envir The environment where the calling function was invoked (required for NSE)
# @param class The model class: "norm" for normal likelihood (yi, sei) or
#              "glmm" for generalized linear mixed models (ai, bi, ci, di, etc.)
# @param measure The effect size measure (used for "glmm" class to determine
#                outcome type: "OR" for binomial, "IRR" for Poisson)
# @param standardize_continuous_predictors Whether to standardize continuous predictors
# @param skip_validation Whether to skip predictor validation checks (variability/levels).
#        Should be TRUE when processing newdata for prediction. (validation done in BayesTools::JAGS_formula())
#
# @return A list with components:
#   - outcome: data.frame with outcome variables (columns depend on class)
#   - mods: moderator information (data.frame or NULL)
#   - scale: scale information (data.frame or NULL)
.check_and_list_data <- function(.call, .envir, class = "norm", measure,
                                 set_contrast_factor_predictors, standardize_continuous_predictors,
                                 effect_direction = "positive", skip_validation = FALSE) {

  # check additional input
  .check_measure(measure, class = class)
  BayesTools::check_bool(standardize_continuous_predictors, "standardize_continuous_predictors", allow_NA = FALSE)
  BayesTools::check_char(set_contrast_factor_predictors, "set_contrast_factor_predictors", allow_values = c("treatment", "meandif", "orthonormal"), allow_NA = FALSE)
  if (missing(effect_direction)) {
    effect_direction <- "positive"
  }
  BayesTools::check_char(effect_direction, "effect_direction", allow_values = c("positive", "negative", "detect"))
  BayesTools::check_bool(skip_validation, "skip_validation")

  ### Extract the data argument first - other variables may reference columns within it
  data <- .get_variable(.call, NULL, .envir, "data", allow_NULL = TRUE)

  ### Step 1: Extract and validate outcome variables (dispatch based on class)
  outcome_result <- switch(
    class,
    "norm" = .check_and_list_data.outcome.norm(.call, data, .envir, effect_direction, skip_validation),
    "glmm" = switch(
      measure,
      "OR"  = .check_and_list_data.outcome.bin(.call, data, .envir, skip_validation),
      "IRR" = .check_and_list_data.outcome.pois(.call, data, .envir, skip_validation)
    )
  )

  data_outcome       <- outcome_result$data_outcome
  k                  <- outcome_result$k
  mods_from_yi       <- outcome_result$mods_from_yi
  formula_yi         <- outcome_result$formula_yi
  weights_provided   <- outcome_result$weights_provided
  slab_provided      <- outcome_result$slab_provided
  cluster_provided   <- outcome_result$cluster_provided
  na_check_cols      <- outcome_result$na_check_cols
  outcome_type       <- outcome_result$outcome_type
  effect_direction   <- outcome_result$effect_direction

  ### Step 2: Extract moderator variables (mods and scale)
  if (!is.null(mods_from_yi)) {
    # Mods already extracted from yi formula
    data_mods <- as.data.frame(mods_from_yi)
    attr(data_mods, "formula")    <- formula_yi[-2]  # RHS only
    attr(data_mods, "formula_yi") <- formula_yi
  } else {
    data_mods <- .check_and_list_data.predictors(
      .call  = .call,
      data   = data,
      .envir = .envir,
      name   = "mods",
      k      = k
    )
  }

  data_scale <- .check_and_list_data.predictors(
    .call  = .call,
    data   = data,
    .envir = .envir,
    name   = "scale",
    k      = k
  )

    data_mods  <- .check_and_list_data.coerce_character_predictors(data_mods)
    data_scale <- .check_and_list_data.coerce_character_predictors(data_scale)

  ### Step 3: Apply subset (before NA handling)
  subset <- .get_variable(.call, data, .envir, "subset", allow_NULL = TRUE)

  if (!is.null(subset)) {
    # Validate and convert subset to logical
    subset <- .check_and_list_data.validate_subset(subset, k)

    # Apply subsetting to all data frames
    data_outcome <- .check_and_list_data.subset(data_outcome, subset)
    data_mods    <- .check_and_list_data.subset(data_mods, subset)
    data_scale   <- .check_and_list_data.subset(data_scale, subset)
  }

  ### Step 4: Handle NA dropping
  # Build list of data frames to check for NAs
  # Use only the columns specified by the outcome handler for NA checking
  data_list_for_na <- list(
    outcome_required = data_outcome[, na_check_cols, drop = FALSE]
  )
  if (!is.null(data_mods)) {
    data_list_for_na$mods <- data_mods
  }
  if (!is.null(data_scale)) {
    data_list_for_na$scale <- data_scale
  }

  # Get rows with NAs
  na_rows <- .check_and_list_data.check_na(data_list_for_na)

  # Drop NA rows if any
  n_dropped <- sum(na_rows)
  if (n_dropped > 0) {

    warning(paste0(n_dropped, " observation(s) removed due to missing values."), call. = FALSE, immediate. = TRUE)

    keep_rows <- !na_rows
    data_outcome <- .check_and_list_data.subset(data_outcome, keep_rows)
    data_mods    <- .check_and_list_data.subset(data_mods, keep_rows)
    data_scale   <- .check_and_list_data.subset(data_scale, keep_rows)
  }

  ### Step 5: Final validation and processing
  k_final <- nrow(data_outcome)

  if (k_final == 0) {
    stop("No observations remaining after removing missing values.", call. = FALSE)
  }

  if (outcome_type == "norm" && effect_direction == "detect") {
    effect_direction <- if (stats::median(data_outcome[["yi"]], na.rm = TRUE) >= 0) "positive" else "negative"
  }

  # Validate predictor variables (after subsetting and NA dropping)
  # Skip validation when processing newdata (skip_validation = TRUE)
  .check_and_list_data.validate_predictors(data_mods, "mods", skip_validation)
  .check_and_list_data.validate_predictors(data_scale, "scale", skip_validation)

  # Generate default study labels if not provided (after NA dropping)
  if (!slab_provided) {
    data_outcome$slab <- paste0("Study ", seq_len(k_final))
  }

  # Convert cluster to numeric factor for use as indices in fitting (after NA dropping)
  if (cluster_provided) {
    if (anyNA(data_outcome[["cluster"]])) {
      stop("The 'cluster' argument must not contain missing values.", call. = FALSE)
    }

    cluster_factor <- factor(data_outcome[["cluster"]], levels = unique(data_outcome[["cluster"]]))
    data_outcome[["cluster_label"]] <- as.character(cluster_factor)
    data_outcome[["cluster"]]       <- as.integer(cluster_factor)
  }

  ### Create output object
  data_list <- list(
    outcome = data_outcome,
    mods    = data_mods,
    scale   = data_scale
  )

  class(data_list) <- "RoBMA_data"
  attr(data_list, "outcome_type")                       <- outcome_type
  attr(data_list, "measure")                            <- measure
  attr(data_list, "n_dropped")                          <- n_dropped
  attr(data_list, "k_final")                            <- k_final
  attr(data_list, "mods")                               <- !is.null(data_mods)
  attr(data_list, "scale")                              <- !is.null(data_scale)
  attr(data_list, "weights")                            <- weights_provided
  attr(data_list, "slab")                               <- slab_provided
  attr(data_list, "cluster")                            <- cluster_provided
  attr(data_list, "standardize_continuous_predictors")  <- standardize_continuous_predictors
  attr(data_list, "set_contrast_factor_predictors")     <- set_contrast_factor_predictors
  attr(data_list, "effect_direction")                   <- effect_direction
  return(data_list)
}


# Internal function to extract and validate outcome variables for normal likelihood models
# Handles yi, vi/sei, ni, weights, cluster, slab, and yi as formula
#
# @param .call Matched call from the calling function
# @param data The data frame (can be NULL)
# @param .envir The enclosing environment
# @param effect_direction The effect direction ("positive", "negative", or "detect")
# @param skip_validation Whether to skip strict validation checks (for newdata).
#        When TRUE, allows sei/vi/ni = 0 (useful for prediction at zero SE).
#
# @return A list with:
#   - data_outcome: data.frame with yi, sei, ni, cluster, slab, weights
#   - k: number of observations
#   - mods_from_yi: moderators extracted from yi formula (or NULL)
#   - formula_yi: the original yi formula (or NULL)
#   - slab_provided: logical, whether slab was provided by user
#   - cluster_provided: logical, whether cluster was provided by user
#   - na_check_cols: character vector of column names to check for NAs
.check_and_list_data.outcome.norm <- function(.call, data, .envir, effect_direction, skip_validation = FALSE) {

  # Extract yi (may be a formula like yi ~ mod1 + mod2)
  yi <- .get_variable(.call, data, .envir, "yi", allow_NULL = FALSE)

  # Handle yi as formula (e.g., yi ~ mod1 + mod2)
  formula_yi   <- NULL
  mods_from_yi <- NULL

  if (inherits(yi, "formula")) {

    formula_yi <- yi

    # Check that mods is not also specified (would be ambiguous)
    mods_check <- .get_variable(.call, data, .envir, "mods", allow_NULL = TRUE)
    if (!is.null(mods_check)) {
      stop("Cannot specify 'mods' when 'yi' is a formula. Use either 'yi ~ mod1 + mod2' or 'yi = effect, mods = ~ mod1 + mod2', but not both.", call. = FALSE)
    }

    # Extract the model frame from the formula
    na_act <- getOption("na.action")
    options(na.action = "na.pass")
    on.exit(options(na.action = na_act), add = TRUE)

    full_mf <- stats::model.frame(yi, data = data)

    # Extract response (LHS)
    yi <- stats::model.response(full_mf)
    names(yi) <- NULL

    # Extract predictors (RHS)
    if (ncol(full_mf) > 1) {
      mods_from_yi <- full_mf[, -1, drop = FALSE]
    }
  }

  # Extract variance/standard error
  vi  <- .get_variable(.call, data, .envir, "vi",  allow_NULL = TRUE)
  sei <- .get_variable(.call, data, .envir, "sei", allow_NULL = TRUE)

  # Extract ni (sample sizes) - specific to normal likelihood models
  ni <- .get_variable(.call, data, .envir, "ni", allow_NULL = TRUE)

  ### Input validation

  # Validate yi
  BayesTools::check_real(yi, "yi", check_length = 0, allow_NULL = FALSE, allow_NA = TRUE)
  if (all(is.na(yi)))
    stop("The 'yi' argument must contain at least one non-NA value.", call. = FALSE)

  k <- length(yi)

  # Validate vi (sampling variance)
  # When skip_validation = TRUE (newdata), allow vi = 0 for prediction at zero variance
  if (!is.null(vi))
    BayesTools::check_real(vi, "vi", check_length = k, allow_NULL = TRUE, allow_NA = TRUE, lower = 0, allow_bound = skip_validation)

  # Validate sei (standard error)
  # When skip_validation = TRUE (newdata), allow sei = 0 for prediction at zero SE
  if (!is.null(sei))
    BayesTools::check_real(sei, "sei", check_length = k, allow_NULL = TRUE, allow_NA = TRUE, lower = 0, allow_bound = skip_validation)

  # Convert vi to sei or vice versa
  if (is.null(sei) && !is.null(vi)) {
    sei <- sqrt(vi)
  } else if (is.null(vi) && !is.null(sei)) {
    vi <- sei^2
  } else if (is.null(vi) && is.null(sei)) {
    stop("Either 'vi' (variance) or 'sei' (standard error) must be provided.", call. = FALSE)
  } else {
    # Both provided - check consistency
    if (any(abs(vi - sei^2) > 1e-10, na.rm = TRUE))
      stop("The provided 'vi' and 'sei' values are inconsistent.", call. = FALSE)
  }

  # Validate ni (sample sizes)
  # When skip_validation = TRUE (newdata), allow ni = 0
  if (!is.null(ni))
    BayesTools::check_real(ni, "ni", check_length = k, allow_NULL = TRUE, allow_NA = TRUE, lower = 0, allow_bound = skip_validation)

  # Extract and validate common optional variables (weights, cluster, slab)
  optional <- .check_and_list_data.optional_vars(.call, data, .envir, k, "yi")

  ### Construct output data frame
  data_outcome <- data.frame(
    yi            = yi,
    sei           = sei,
    ni            = if (!is.null(ni))                  ni               else rep(NA_integer_, k),
    cluster       = if (!is.null(optional$cluster))    optional$cluster else rep(NA_character_, k),
    cluster_label = if (!is.null(optional$cluster))    as.character(optional$cluster) else rep(NA_character_, k),
    slab          = if (!is.null(optional$slab))       optional$slab    else rep(NA_character_, k),
    weights       = if (!is.null(optional$weights))    optional$weights else rep(NA, k),
    stringsAsFactors = FALSE
  )

  return(list(
    data_outcome       = data_outcome,
    k                  = k,
    mods_from_yi       = mods_from_yi,
    formula_yi         = formula_yi,
    weights_provided   = optional$weights_provided,
    slab_provided      = optional$slab_provided,
    cluster_provided   = optional$cluster_provided,
    na_check_cols      = c("yi", "sei"),  # Only check these columns for NAs
    effect_direction   = effect_direction,
    outcome_type       = "norm"
  ))
}


# Internal function to extract and validate outcome variables for binomial GLMM models
# Handles ai, bi, ci, di, n1i, n2i, weights, cluster, slab (for measure = "OR")
#
# @param .call Matched call from the calling function
# @param data The data frame (can be NULL)
# @param .envir The enclosing environment
#
# @return A list with:
#   - data_outcome: data.frame with ai, ci, n1i, n2i, cluster, slab, weights
#   - k: number of observations
#   - mods_from_yi: NULL (GLMM does not support formula syntax for outcome)
#   - formula_yi: NULL
#   - slab_provided: logical, whether slab was provided by user
#   - cluster_provided: logical, whether cluster was provided by user
#   - na_check_cols: character vector of column names to check for NAs
# @param skip_validation Whether to skip strict validation checks (for newdata).
.check_and_list_data.outcome.bin <- function(.call, data, .envir, skip_validation = FALSE) {

  # Extract cell counts for 2x2 tables
  ai  <- .get_variable(.call, data, .envir, "ai",  allow_NULL = TRUE)
  bi  <- .get_variable(.call, data, .envir, "bi",  allow_NULL = TRUE)
  ci  <- .get_variable(.call, data, .envir, "ci",  allow_NULL = TRUE)
  di  <- .get_variable(.call, data, .envir, "di",  allow_NULL = TRUE)
  n1i <- .get_variable(.call, data, .envir, "n1i", allow_NULL = TRUE)
  n2i <- .get_variable(.call, data, .envir, "n2i", allow_NULL = TRUE)

  ### Determine input format and validate

  # Determine k from first available variable
  if (!is.null(ai)) {
    k <- length(ai)
  } else {
    stop("For GLMM models, provide either (ai, bi, ci, di) or (ai, ci, n1i, n2i).", call. = FALSE)
  }

  count_args <- list(ai = ai, bi = bi, ci = ci, di = di, n1i = n1i, n2i = n2i)
  for (arg_name in names(count_args)) {
    if (!is.null(count_args[[arg_name]])) {
      BayesTools::check_int(
        count_args[[arg_name]], arg_name,
        check_length = k, allow_NULL = FALSE, allow_NA = TRUE, lower = 0
      )
    }
  }

  has_cells  <- !is.null(ci) && !is.null(bi) && !is.null(di)
  has_totals <- !is.null(ci) && !is.null(n1i) && !is.null(n2i)

  if (!has_cells && !has_totals) {
    stop("For GLMM models, provide either (ai, bi, ci, di) or (ai, ci, n1i, n2i).", call. = FALSE)
  }

  if (!is.null(n1i) && any(ai > n1i, na.rm = TRUE))
    stop("Invalid data: some values of 'bi' (= n1i - ai) are negative.", call. = FALSE)
  if (!is.null(n2i) && any(ci > n2i, na.rm = TRUE))
    stop("Invalid data: some values of 'di' (= n2i - ci) are negative.", call. = FALSE)

  if (!is.null(bi) && !is.null(n1i) && any(n1i != ai + bi, na.rm = TRUE))
    stop("The provided 'n1i' values must equal 'ai + bi' when both are supplied.", call. = FALSE)
  if (!is.null(di) && !is.null(n2i) && any(n2i != ci + di, na.rm = TRUE))
    stop("The provided 'n2i' values must equal 'ci + di' when both are supplied.", call. = FALSE)

  if (!is.null(bi)) {
    n1i <- ai + bi
  }
  if (!is.null(di)) {
    n2i <- ci + di
  }

  bi <- n1i - ai
  di <- n2i - ci

  # Extract and validate common optional variables (weights, cluster, slab)
  optional <- .check_and_list_data.optional_vars(.call, data, .envir, k, "ai")

  ### Construct output data frame
  data_outcome <- data.frame(
    ai            = ai,
    ci            = ci,
    n1i           = n1i,
    n2i           = n2i,
    cluster       = if (!is.null(optional$cluster)) optional$cluster else rep(NA_character_, k),
    cluster_label = if (!is.null(optional$cluster)) as.character(optional$cluster) else rep(NA_character_, k),
    slab          = if (!is.null(optional$slab))      optional$slab      else rep(NA_character_, k),
    weights       = if (!is.null(optional$weights))   optional$weights   else rep(NA, k),
    stringsAsFactors = FALSE
  )

  return(list(
    data_outcome       = data_outcome,
    k                  = k,
    mods_from_yi       = NULL,  # GLMM does not support formula syntax
    formula_yi         = NULL,
    weights_provided   = optional$weights_provided,
    slab_provided      = optional$slab_provided,
    cluster_provided   = optional$cluster_provided,
    na_check_cols      = c("ai", "ci", "n1i", "n2i"),  # Check cell counts for NAs
    effect_direction   = "positive", # Non-consequential, for consistency with .norm
    outcome_type       = "bin"
  ))
}


# Internal function to extract and validate outcome variables for Poisson GLMM models
# Handles x1i, x2i, t1i, t2i, weights, cluster, slab (for measure = "IRR")
#
# @param .call Matched call from the calling function
# @param data The data frame (can be NULL)
# @param .envir The enclosing environment
# @param skip_validation Whether to skip strict validation checks (for newdata).
#        When TRUE, allows t1i/t2i = 0 (useful for prediction).
#
# @return A list with:
#   - data_outcome: data.frame with x1i, x2i, t1i, t2i, cluster, slab, weights
#   - k: number of observations
#   - mods_from_yi: NULL (GLMM does not support formula syntax for outcome)
#   - formula_yi: NULL
#   - slab_provided: logical, whether slab was provided by user
#   - cluster_provided: logical, whether cluster was provided by user
#   - na_check_cols: character vector of column names to check for NAs
.check_and_list_data.outcome.pois <- function(.call, data, .envir, skip_validation = FALSE) {

  # Extract event counts and person-time for Poisson models
  x1i <- .get_variable(.call, data, .envir, "x1i", allow_NULL = TRUE)
  x2i <- .get_variable(.call, data, .envir, "x2i", allow_NULL = TRUE)
  t1i <- .get_variable(.call, data, .envir, "t1i", allow_NULL = TRUE)
  t2i <- .get_variable(.call, data, .envir, "t2i", allow_NULL = TRUE)

  ### Validate that all required variables are provided
  if (is.null(x1i) || is.null(x2i) || is.null(t1i) || is.null(t2i)) {
    stop("For Poisson models (measure = 'IRR'), all of 'x1i', 'x2i', 't1i', and 't2i' must be provided.", call. = FALSE)
  }

  # Determine k from x1i
  k <- length(x1i)

  ### Validate inputs
  # x1i: event counts in treatment group
  BayesTools::check_int(x1i, "x1i", check_length = k, allow_NULL = FALSE, allow_NA = TRUE, lower = 0)

  # x2i: event counts in control group
  BayesTools::check_int(x2i, "x2i", check_length = k, allow_NULL = FALSE, allow_NA = TRUE, lower = 0)

  # t1i: person-time in treatment group
  # When skip_validation = TRUE (newdata), allow t1i = 0
  BayesTools::check_real(t1i, "t1i", check_length = k, allow_NULL = FALSE, allow_NA = TRUE, lower = 0, allow_bound = skip_validation)

  # t2i: person-time in control group
  # When skip_validation = TRUE (newdata), allow t2i = 0
  BayesTools::check_real(t2i, "t2i", check_length = k, allow_NULL = FALSE, allow_NA = TRUE, lower = 0, allow_bound = skip_validation)

  # Extract and validate common optional variables (weights, cluster, slab)
  optional <- .check_and_list_data.optional_vars(.call, data, .envir, k, "x1i")

  ### Construct output data frame
  data_outcome <- data.frame(
    x1i           = x1i,
    x2i           = x2i,
    t1i           = t1i,
    t2i           = t2i,
    cluster       = if (!is.null(optional$cluster)) optional$cluster else rep(NA_character_, k),
    cluster_label = if (!is.null(optional$cluster)) as.character(optional$cluster) else rep(NA_character_, k),
    slab          = if (!is.null(optional$slab))      optional$slab      else rep(NA_character_, k),
    weights       = if (!is.null(optional$weights))   optional$weights   else rep(NA, k),
    stringsAsFactors = FALSE
  )

  return(list(
    data_outcome       = data_outcome,
    k                  = k,
    mods_from_yi       = NULL,  # GLMM does not support formula syntax
    formula_yi         = NULL,
    weights_provided   = optional$weights_provided,
    slab_provided      = optional$slab_provided,
    cluster_provided   = optional$cluster_provided,
    na_check_cols      = c("x1i", "x2i", "t1i", "t2i"),  # Check Poisson data for NAs
    effect_direction   = "positive", # Non-consequential, for consistency with .norm
    outcome_type       = "pois"
  ))
}


# Internal function to extract and validate predictor variables (mods or scale)
# Handles formulas, matrices, and data.frames
#
# @param .call Matched call from the calling function
# @param data The data frame (can be NULL)
# @param .envir The enclosing environment
# @param name The name of the argument ("mods" or "scale")
# @param k The expected number of rows
#
# @return A data.frame with predictor variables (with "formula" attribute) or NULL
.check_and_list_data.predictors <- function(.call, data, .envir, name, k) {

  # Check if argument is in the call
  arg_index <- match(name, names(.call))

  if (is.na(arg_index))
    return(NULL)

  mods_expr <- .call[[arg_index]]

  if (is.null(mods_expr))
    return(NULL)

  # Evaluate the expression to get the formula, matrix, or data.frame
  if (!is.null(data) && is.data.frame(data)) {
    mods <- try(eval(mods_expr, data, .envir), silent = TRUE)
  } else {
    mods <- try(eval(mods_expr, .envir), silent = TRUE)
  }

  if (inherits(mods, "try-error"))
    stop(paste0("Cannot evaluate the '", name, "' argument: ",
                conditionMessage(attr(mods, "condition"))), call. = FALSE)

  # Handle formula input
  if (inherits(mods, "formula")) {

    original_formula <- mods

    # Ensure formula has no LHS (response)
    if (length(mods) == 3) {
      warning(paste0("The '", name, "' formula should not have a left-hand side. ",
                     "The LHS will be ignored."), call. = FALSE)
      mods <- mods[-2]
      original_formula <- mods
    }

    # Create model frame from formula
    if (!is.null(data) && is.data.frame(data)) {
      mf <- try(
        stats::model.frame(mods, data = data, na.action = stats::na.pass),
        silent = TRUE
      )
    } else {
      mf <- try(
        stats::model.frame(mods, na.action = stats::na.pass),
        silent = TRUE
      )
    }

    if (inherits(mf, "try-error"))
      stop(paste0("Cannot create model frame from '", name, "' formula: ",
                  conditionMessage(attr(mf, "condition"))), call. = FALSE)

    if (nrow(mf) != k)
      stop(paste0("The number of rows in '", name, "' (", nrow(mf),
                  ") must equal the number of effect sizes (", k, ")."), call. = FALSE)

    rownames(mf) <- NULL
    attr(mf, "formula") <- original_formula

    return(mf)

  } else if (is.matrix(mods)) {

    mods_df <- as.data.frame(mods)

    if (nrow(mods_df) != k)
      stop(paste0("The number of rows in '", name, "' (", nrow(mods_df),
                  ") must equal the number of effect sizes (", k, ")."), call. = FALSE)

    rownames(mods_df) <- NULL
    attr(mods_df, "formula") <- .create_formula_from_names(names(mods_df))

    return(mods_df)

  } else if (is.data.frame(mods)) {

    mods_df <- mods

    if (nrow(mods_df) != k)
      stop(paste0("The number of rows in '", name, "' (", nrow(mods_df),
                  ") must equal the number of effect sizes (", k, ")."), call. = FALSE)

    rownames(mods_df) <- NULL
    attr(mods_df, "formula") <- .create_formula_from_names(names(mods_df))

    return(mods_df)

  } else {
    stop(paste0("The '", name, "' argument must be a formula, matrix, or data.frame."),
         call. = FALSE)
  }
}


# Internal function to coerce character predictors to factors
# Character moderators and scale predictors must be factors before downstream
# contrast handling builds design matrices.
#
# @param df A data.frame with predictor variables (can be NULL)
#
# @return The input data.frame with character columns converted to factors
.check_and_list_data.coerce_character_predictors <- function(df) {

  if (is.null(df))
    return(NULL)

  character_columns <- vapply(df, is.character, logical(1))

  if (!any(character_columns))
    return(df)

  for (col_name in names(df)[character_columns]) {
    df[[col_name]] <- factor(df[[col_name]])
  }

  return(df)
}


# Internal function to validate and convert subset argument to logical vector
#
# @param subset The subset argument (logical or numeric vector)
# @param k The expected length (number of observations)
#
# @return A logical vector of length k
.check_and_list_data.validate_subset <- function(subset, k) {

  if (is.logical(subset)) {

    if (length(subset) != k)
      stop(paste0("The 'subset' argument must have length ", k, " (same as the outcome) when logical."), call. = FALSE)

    # Check for NAs in subset
    if (any(is.na(subset)))
      stop("The 'subset' argument must not contain NA values.", call. = FALSE)

    return(subset)

  } else if (is.numeric(subset)) {

    # Check for NAs in subset
    if (any(is.na(subset)))
      stop("The 'subset' argument must not contain NA values.", call. = FALSE)

    if (any(!is.finite(subset)) || any(subset != floor(subset)))
      stop("The 'subset' argument must contain integer numeric indices.", call. = FALSE)

    if (any(subset < 1) || any(subset > k))
      stop(paste0("The 'subset' argument must contain values between 1 and ", k, "."), call. = FALSE)

    # Convert numeric indices to logical
    subset_logical <- rep(FALSE, k)
    subset_logical[subset] <- TRUE
    return(subset_logical)

  } else {
    stop("The 'subset' argument must be a logical or numeric vector.", call. = FALSE)
  }
}


# Internal function to apply subsetting to a data.frame
# Preserves formula attributes and drops unused factor levels
#
# @param df A data.frame (can be NULL)
# @param subset A logical vector indicating which rows to keep
#
# @return The subsetted data.frame (or NULL if input was NULL)
.check_and_list_data.subset <- function(df, subset) {

  if (is.null(df))
    return(NULL)

  # Preserve attributes
  saved_formula    <- attr(df, "formula")
  saved_formula_yi <- attr(df, "formula_yi")

  # Apply subset
  df <- df[subset, , drop = FALSE]

  # Drop unused factor levels
  df <- droplevels(df)

  # Reset row names
  rownames(df) <- NULL

  # Restore attributes
  if (!is.null(saved_formula))
    attr(df, "formula") <- saved_formula
  if (!is.null(saved_formula_yi))
    attr(df, "formula_yi") <- saved_formula_yi

  return(df)
}


# Internal function to validate predictor variables (mods or scale)
# Checks that:
#   - Continuous variables have sd > 0 (are not constant)
#   - Factor variables have more than one level
#
# These checks are appropriate for original data but should be skipped for
# newdata used in prediction, where single-value predictions are legitimate.
#
# @param df A data.frame with predictor variables (can be NULL)
# @param name The name of the predictor type ("mods" or "scale") for error messages
# @param skip_validation Logical; if TRUE, skip all validation checks.
#        Should be TRUE when processing newdata for prediction.
#
# @return NULL (invisibly); stops with error if validation fails
.check_and_list_data.validate_predictors <- function(df, name, skip_validation = FALSE) {

  if (is.null(df))
    return(invisible(NULL))

  # Skip validation for newdata (e.g., predicting at a single factor level or

  # single continuous value is valid for prediction)
  if (skip_validation)
    return(invisible(NULL))

  for (col_name in names(df)) {
    col <- df[[col_name]]

    if (is.factor(col) || is.character(col)) {
      # Factor/character: must have more than one unique level
      n_levels <- length(unique(col))
      if (n_levels < 2) {
        stop(paste0("The '", name, "' variable '", col_name,
                    "' has only one level. Factor predictors must have at least two levels."),
             call. = FALSE)
      }
    } else if (is.numeric(col)) {
      # Numeric: must have sd > 0 (not constant)
      col_sd <- stats::sd(col, na.rm = TRUE)
      if (is.na(col_sd) || col_sd == 0) {
        stop(paste0("The '", name, "' variable '", col_name,
                    "' has zero variance (all values are identical). ",
                    "Continuous predictors must have variation."),
             call. = FALSE)
      }
    }
    # For other types (logical, etc.), no specific check
  }

  return(invisible(NULL))
}


# Internal function to check for NA values across a list of data.frames
# Returns a logical vector indicating which rows have at least one NA
#
# @param data_list A named list of data.frames to check
#
# @return A logical vector where TRUE indicates the row has at least one NA
.check_and_list_data.check_na <- function(data_list) {

  if (length(data_list) == 0)
    return(logical(0))

  # Combine all data.frames into one for vectorized NA check
  combined <- do.call(cbind, Filter(function(df) !is.null(df) && is.data.frame(df), data_list))

  # Vectorized NA check: TRUE if any column in that row has NA
  na_rows <- rowSums(is.na(combined)) > 0

  return(na_rows)
}


# Internal helper to create a formula from column names
# Creates a one-sided formula like ~ var1 + var2 + var3
#
# @param col_names Character vector of column names
# @return A formula object
.create_formula_from_names <- function(col_names) {

  if (length(col_names) == 0) {
    return(~ 1)
  }

  # Backtick names that need protection (contain spaces, special chars, etc.)
  col_names_safe <- vapply(col_names, function(nm) {
    if (make.names(nm) != nm) {
      paste0("`", nm, "`")
    } else {
      nm
    }
  }, character(1), USE.NAMES = FALSE)

  # Create formula string and convert to formula
  formula_str <- paste("~", paste(col_names_safe, collapse = " + "))
  return(stats::as.formula(formula_str))
}


#' @title Print method for RoBMA_data objects
#'
#' @description Prints a concise summary of an RoBMA_data object.
#'
#' @param x an RoBMA_data object
#' @param n maximum number of rows to display for each component. Defaults to 6.
#' @param ... additional arguments (ignored)
#'
#' @return Invisibly returns the input object.
#'
#' @export
print.RoBMA_data <- function(x, n = 6, ...) {

  k_final   <- attr(x, "k_final")
  n_dropped <- attr(x, "n_dropped")

  # Outcome data header
  if (n_dropped > 0) {
    cat(sprintf("Outcome data (k = %d, dropped = %d)\n", k_final, n_dropped))
  } else {
    cat(sprintf("Outcome data (k = %d)\n", k_final))
  }
  print(utils::head(x$outcome, n = n))

  # Moderators (effect)
  if (!is.null(x$mods)) {
    cat("\nModerators (effect)\n")
    print(utils::head(x$mods, n = n))
  }

  # Moderators (scale)
  if (!is.null(x$scale)) {
    cat("\nModerators (scale)\n")
    print(utils::head(x$scale, n = n))
  }

  # Indicate if more rows exist

  if (k_final > n) {
    cat(sprintf("\n... with %d more rows\n", k_final - n))
  }

  invisible(x)
}


# Internal helper to normalize data.frame/list newdata inputs.
.prepare_newdata_as_data_frame <- function(newdata) {

  if (is.list(newdata) && !is.data.frame(newdata)) {
    newdata <- as.data.frame(newdata, stringsAsFactors = FALSE)
  }

  return(newdata)
}


# Internal helper to report missing newdata columns.
.prepare_newdata_stop_missing <- function(newdata, cols) {

  missing_cols <- setdiff(cols, names(newdata))
  if (length(missing_cols) == 0L) {
    return(invisible(TRUE))
  }

  stop(
    "The 'newdata' must contain columns: ",
    paste(missing_cols, collapse = ", "), ".",
    call. = FALSE
  )
}


# Internal helper to add placeholder columns that are not used by prediction.
.prepare_newdata_add_missing <- function(newdata, cols, value, n) {

  for (col in cols) {
    if (!col %in% names(newdata)) {
      newdata[[col]] <- rep(value, n)
    }
  }

  return(newdata)
}


# Internal helper to decide whether normal newdata needs sampling SE.
.prepare_newdata_needs_norm_sei <- function(object, type, bias_adjusted) {

  return(
    type == "response" ||
      (
        !bias_adjusted &&
          type %in% c("terms", "cluster", "estimate") &&
          (.is_PET(object) || .is_PEESE(object))
      )
  )
}


# Internal helper to add outcome placeholders for the shared data parser.
.prepare_newdata_outcome <- function(object, newdata, type, bias_adjusted) {

  outcome_type <- .outcome_type(object)
  n_new        <- nrow(newdata)

  if (outcome_type == "norm") {
    newdata <- .prepare_newdata_add_missing(
      newdata = newdata,
      cols    = "yi",
      value   = 0,
      n       = n_new
    )

    if (!("sei" %in% names(newdata) || "vi" %in% names(newdata))) {
      if (.prepare_newdata_needs_norm_sei(object, type, bias_adjusted)) {
        stop(
          "The 'newdata' must contain either 'sei' (standard error) or ",
          "'vi' (variance) column.",
          call. = FALSE
        )
      }
      newdata[["sei"]] <- rep(0, n_new)
    }

  } else if (outcome_type == "bin") {
    if (type == "response") {
      .prepare_newdata_stop_missing(newdata, c("n1i", "n2i"))
    }
    newdata <- .prepare_newdata_add_missing(
      newdata = newdata,
      cols    = c("ai", "ci", "n1i", "n2i"),
      value   = 0L,
      n       = n_new
    )

  } else if (outcome_type == "pois") {
    if (type == "response") {
      .prepare_newdata_stop_missing(newdata, c("t1i", "t2i"))
    }
    newdata <- .prepare_newdata_add_missing(
      newdata = newdata,
      cols    = c("x1i", "x2i", "t1i", "t2i"),
      value   = 0,
      n       = n_new
    )
  }

  return(newdata)
}


# Internal helper to construct outcome arguments for `.check_and_list_data`.
.prepare_newdata_outcome_call_args <- function(outcome_type, newdata) {

  if (outcome_type == "norm") {
    call_args <- list(yi = quote(yi))

    if ("sei" %in% names(newdata)) {
      call_args[["sei"]] <- quote(sei)
    } else {
      call_args[["vi"]] <- quote(vi)
    }

  } else if (outcome_type == "bin") {
    call_args <- list(
      ai  = quote(ai),
      ci  = quote(ci),
      n1i = quote(n1i),
      n2i = quote(n2i)
    )

  } else if (outcome_type == "pois") {
    call_args <- list(
      x1i = quote(x1i),
      x2i = quote(x2i),
      t1i = quote(t1i),
      t2i = quote(t2i)
    )
  }

  return(call_args)
}


# Internal helper to validate formula variables before evaluation.
.prepare_newdata_validate_formula_vars <- function(newdata, formula, label) {

  missing_vars <- setdiff(all.vars(formula), names(newdata))
  if (length(missing_vars) == 0L) {
    return(invisible(TRUE))
  }

  stop(
    "The 'newdata' must contain all ", label, " variables. Missing: ",
    paste(missing_vars, collapse = ", "), ".",
    call. = FALSE
  )
}


# Internal helper function to prepare newdata for prediction
# Reuses `.check_and_list_data` by constructing appropriate call and environment
#
# The newdata data.frame must contain all variables used by the requested
# prediction. Moderator and scale variables are always required when referenced
# by the fitted formulas. Outcome columns are required only when the requested
# prediction uses the sampling distribution or PET/PEESE bias terms; otherwise
# internal dummy values are inserted to keep the shared parser path.
#
# @param object A fitted brma object
# @param newdata A data.frame with new data for prediction.
# @param type Prediction type: "terms", "effect", or "response"
# @param bias_adjusted Whether PET/PEESE terms should be omitted.
#
# @return A data list equivalent to `object[["data"]]` but for `newdata`
.prepare_newdata <- function(object, newdata, type, bias_adjusted = FALSE) {

  # extract settings from the original fitted object's data attributes
  original_data <- object[["data"]]
  set_contrast_factor_predictors    <- attr(original_data, "set_contrast_factor_predictors")
  standardize_continuous_predictors <- attr(original_data, "standardize_continuous_predictors")
  effect_direction                  <- .effect_direction(object)
  outcome_type                      <- .outcome_type(object)

  newdata <- .prepare_newdata_as_data_frame(newdata)
  newdata <- .prepare_newdata_outcome(
    object        = object,
    newdata       = newdata,
    type          = type,
    bias_adjusted = bias_adjusted
  )

  # build the synthetic call expression
  # start with the base call structure
  call_args <- list(quote(.prepare_newdata_call))

  # add data argument
  call_args[["data"]] <- quote(data)

  # add outcome arguments based on outcome_type
  call_args <- c(
    call_args,
    .prepare_newdata_outcome_call_args(outcome_type, newdata)
  )

  # add mods formula if this is a regression model
  if (.is_mods(object)) {
    call_args[["mods"]] <- attr(original_data[["mods"]], "formula")
  }

  # add scale formula if present
  if (.is_scale(object)) {
    call_args[["scale"]] <- attr(original_data[["scale"]], "formula")
  }

  # add cluster structure for multilevel predictions
  if (.is_multilevel(object)) {
    if ("cluster" %in% names(newdata)) {
      call_args[["cluster"]] <- quote(cluster)
    } else {
      n_new <- nrow(newdata)
      newdata[[".RoBMA_cluster"]] <- seq_len(n_new)
      call_args[["cluster"]] <- quote(.RoBMA_cluster)
    }
  }

  # construct the call object
  .call <- as.call(call_args)

  # pre-validate that all formula variables exist in newdata
  # this prevents formulas from finding variables in parent environments
  if (.is_mods(object)) {
    mods_formula <- attr(original_data[["mods"]], "formula")
    .prepare_newdata_validate_formula_vars(newdata, mods_formula, "moderator")
  }

  if (.is_scale(object)) {
    scale_formula <- attr(original_data[["scale"]], "formula")
    .prepare_newdata_validate_formula_vars(newdata, scale_formula, "scale")
  }

  # create environment with newdata as "data"
  # use baseenv() as parent to prevent variable leakage from calling context
  .envir           <- new.env(parent = baseenv())
  .envir[["data"]] <- newdata

  # determine class and measure for .check_and_list_data
  measure    <- .measure(object)
  data_class <- switch(
    outcome_type,
    "norm" = "norm",
    "bin"  = "glmm",
    "pois" = "glmm"
  )

  # call .check_and_list_data with extracted settings
  # Use skip_validation = TRUE because:
  # 1. Newdata may have single-level factors (predicting for one group)
  # 2. Newdata may have zero-variance continuous predictors (predicting at one value)
  # 3. Factor levels are validated by BayesTools during formula evaluation
  # 4. Scaling uses original data's mean/sd stored in the fit object
  new_data <- .check_and_list_data(
    .call  = .call,
    .envir = .envir,
    class  = data_class,
    measure = measure,
    set_contrast_factor_predictors    = set_contrast_factor_predictors,
    standardize_continuous_predictors = standardize_continuous_predictors,
    effect_direction                  = effect_direction,
    skip_validation                   = TRUE
  )

  return(new_data)
}