R/fit_NBKP.R
In NBKP: Beta Kernel Process Modeling with Negative Binomial Response

Documented in fit_NBKP

#' @name fit_NBKP
#' @title Fit a Negative Binomial Kernel Process (NBKP) Model
#'
#' @description Fits a Negative Binomial Kernel Process (NBKP) model to count response data
#'   using local kernel smoothing. The method constructs a flexible latent mean surface
#'   by updating Gamma priors with kernel-weighted observations.
#'
#' @param X A numeric input matrix of size \eqn{n \times d}, where each row
#'   corresponds to a covariate vector.
#' @param y A numeric vector of observed counts (length \code{n}).
#' @param Xbounds Optional \eqn{d \times 2} matrix specifying the lower and
#'   upper bounds of each input dimension. Used to normalize inputs to
#'   \eqn{[0,1]^d}. If \code{NULL}, inputs are assumed to be pre-normalized, and
#'   default bounds \eqn{[0,1]^d} are applied.
#' @param prior Global prior type: \code{"noninformative"} (default),
#'   \code{"fixed"}, or \code{"adaptive"}.
#' @param r0 Global prior precision (used when \code{prior = "fixed"} or
#'   \code{"adaptive"}).
#' @param mu0 Global prior mean (used when \code{prior = "fixed"}). Default is
#'   \code{mean(y)}.
#' @param kernel Kernel function for local weighting: \code{"gaussian"}
#'   (default), \code{"matern52"}, or \code{"matern32"}.
#' @param loss Loss function for kernel hyperparameter tuning: \code{"mse"}
#'   (default) or \code{"nll"}.
#' @param n_multi_start Number of random initializations for multi-start
#'   optimization. Default is \eqn{10 \times d}.
#' @param theta Optional. A positive scalar or numeric vector of length \code{d}
#'   specifying kernel lengthscale parameters directly. If \code{NULL}
#'   (default), lengthscales are optimized using multi-start L-BFGS-B to
#'   minimize the specified loss.
#'
#' @return A list of class \code{"NBKP"} containing the fitted NBKP model,
#'   including:
#'   \item{\code{theta_opt}}{Optimized kernel hyperparameters (lengthscales).}
#'   \item{\code{kernel}}{Kernel function used, as a string.}
#'   \item{\code{loss}}{Loss function used for hyperparameter tuning.}
#'   \item{\code{loss_min}}{Minimum loss achieved during optimization, or
#'     \code{NA} if \code{theta} was user-specified.}
#'   \item{\code{X}}{Original input matrix (\eqn{n \times d}).}
#'   \item{\code{Xnorm}}{Normalized input matrix scaled to \eqn{[0,1]^d}.}
#'   \item{\code{Xbounds}}{Normalization bounds for each input dimension (\eqn{d \times 2}).}
#'   \item{\code{y}}{Observed counts.}
#'   \item{\code{phi}}{Estimated negative binomial dispersion parameter.}
#'   \item{\code{prior}}{Type of prior used.}
#'   \item{\code{r0}}{Prior precision parameter.}
#'   \item{\code{mu0}}{Prior mean (for fixed priors).}
#'   \item{\code{alpha0}}{Prior Gamma shape parameter \eqn{\alpha_0(\mathbf{x})}.}
#'   \item{\code{beta0}}{Prior Gamma rate parameter \eqn{\beta_0(\mathbf{x})}.}
#'   \item{\code{alpha_n}}{Posterior shape parameter \eqn{\alpha_n(\mathbf{x})}.}
#'   \item{\code{beta_n}}{Posterior rate parameter \eqn{\beta_n(\mathbf{x})}.}
#'
#' @seealso \code{\link{predict.NBKP}}, \code{\link{plot.NBKP}}, \code{\link{summary.NBKP}}
#'
#' @references Zhao J, Qing K, Xu J (2025). BKP: An R Package for Beta Kernel Process Modeling.
#'
#' @examples
#' # -------------------------- 1D Example --------------------------
#' set.seed(123)
#'
#' true_mu_fun <- function(x) {
#'   exp(sin(x) + 0.5)
#' }
#'
#' n <- 30
#' Xbounds <- matrix(c(-2, 2), nrow=1)
#' X <- matrix(seq(-2, 2, length.out = n))
#' true_mu <- true_mu_fun(X)
#' y <- rnbinom(n, size = 1, mu = true_mu)
#'
#' model1 <- fit_NBKP(X, y, Xbounds=Xbounds)
#' print(model1)
#'
#' @export
#' @importFrom stats dnbinom var
#' @importFrom lhs randomLHS
#' @importFrom optimx multistart
#' @author Xueqin Li

fit_NBKP <- function(
    X, y, Xbounds = NULL,
    prior = c("noninformative", "fixed", "adaptive"), r0 = 0.1, mu0 = mean(y),
    kernel = c("gaussian", "matern52", "matern32"),
    loss = c("mse", "nll"),
    n_multi_start = NULL, theta = NULL
) {

  # -------------------------- Argument checking --------------------------
  if (missing(X) || missing(y)) {
    stop("Arguments 'X' and 'y' must be provided.")
  }

  if (!is.matrix(X) && !is.data.frame(X)) {
    stop("'X' must be a numeric matrix or data frame.")
  }
  if (!is.numeric(as.matrix(X))) {
    stop("'X' must contain numeric values only.")
  }

  if (!is.numeric(y)) stop("'y' must be numeric.")

  X <- as.matrix(X)
  y <- matrix(y, ncol = 1)

  d <- ncol(X)
  n <- nrow(X)

  if (nrow(y) != n) stop("'y' must have the same number of rows as 'X'.")
  if (any(y < 0)) stop("'y' must be nonnegative.")
  if (anyNA(X) || anyNA(y)) stop("Missing values are not allowed in 'X' or 'y'.")

  prior <- match.arg(prior)
  kernel <- match.arg(kernel)
  loss <- match.arg(loss)

  # -------------------------- Xbounds checks --------------------------
  if (is.null(Xbounds)) {
    Xbounds <- cbind(rep(0, d), rep(1, d))
  } else {
    if (!is.matrix(Xbounds)) stop("'Xbounds' must be a numeric matrix.")
    if (!is.numeric(Xbounds)) stop("'Xbounds' must contain numeric values.")
    if (!all(dim(Xbounds) == c(d, 2))) {
      stop(paste("'Xbounds' must be a matrix with dimensions d x 2, where d =", d, "."))
    }
    if (any(Xbounds[,2] <= Xbounds[,1])) {
      stop("Each row of 'Xbounds' must satisfy lower < upper.")
    }
  }

  # -------------------------- prior parameters checks --------------------------
  if (!is.numeric(r0) || length(r0) != 1 || r0 <= 0) {
    stop("'r0' must be a positive scalar.")
  }

  if (!is.numeric(mu0) || length(mu0) != 1 || mu0 <= 0) {
    stop("'mu0' must be a positive scalar.")
  }

  # -------------------------- hyperparameters checks --------------------------
  if (!is.null(n_multi_start)) {
    if (!is.numeric(n_multi_start) || length(n_multi_start) != 1 || n_multi_start <= 0) {
      stop("'n_multi_start' must be a positive integer.")
    }
  }

  if (!is.null(theta)) {
    if (!is.numeric(theta)) stop("'theta' must be numeric.")
    if (length(theta) == 1) {
      theta <- rep(theta, d)
    } else if (length(theta) != d) {
      stop(paste("'theta' must be either a scalar or a vector of length", d, "."))
    }
    if (any(theta <= 0)) stop("'theta' must be strictly positive.")
  }

  # -------------------------- Normalize input X to [0,1]^d --------------------------
  Xnorm <- sweep(X, 2, Xbounds[,1], "-")
  Xnorm <- sweep(Xnorm, 2, Xbounds[,2] - Xbounds[,1], "/")

  # -------------------------- Estimate negative binomial dispersion parameter phi --------------------------
  mu_init <- as.numeric(mean(y))
  var_init <- as.numeric(var(y))

  if (var_init <= mu_init) {
    warning("Data variance is less than or equal to mean, no overdispersion detected.")
    phi <- 1.0
  } else {
    phi <- max(0.01, mu_init^2 / (var_init - mu_init))
  }

  # -------------------------- Define loss function for optimization --------------------------
  loss_fun_nb <- function(gamma, Xnorm, y, phi, prior, r0, mu0, kernel, loss) {
    theta <- 10^gamma
    K <- kernel_matrix(Xnorm, theta = theta, kernel = kernel)

    if (prior == "noninformative") {
      alpha0 <- rep(0.01, n)
      beta0 <- rep(0.01, n)
    } else if (prior == "fixed") {
      alpha0 <- rep(r0 * mu0, n)
      beta0 <- rep(r0, n)
    } else if (prior == "adaptive") {
      w <- K / rowSums(K)
      mu_local <- as.vector(w %*% y)
      r_local <- r0 * rowSums(K)
      alpha0 <- r_local * mu_local
      beta0 <- rep(r0, n)
    }

    alpha_n <- alpha0 + as.vector(K %*% y)
    beta_n <- beta0 + as.vector(K %*% rep(phi, n))
    mu_pred <- alpha_n / beta_n

    if (loss == "mse") {
      return(mean((y - mu_pred)^2))
    } else if (loss == "nll") {
      nll <- -sum(stats::dnbinom(y, mu = mu_pred, size = phi, log = TRUE))
      return(nll)
    }
  }

  # -------------------------- Optimize kernel parameters --------------------------
  if (is.null(theta)) {
    gamma_bounds <- matrix(c((log10(d) - log10(500))/2,
                             (log10(d) + 2)/2),
                           ncol = 2, nrow = d, byrow = TRUE)

    if (is.null(n_multi_start)) n_multi_start <- 10 * d
    init_gamma <- lhs::randomLHS(n_multi_start, d)

    opt_res <- optimx::multistart(
      parmat = init_gamma,
      fn = loss_fun_nb,
      method = "L-BFGS-B",
      lower = rep(-3, d),
      upper = rep(3, d),
      Xnorm = Xnorm, y = y, phi = phi,
      prior = prior, r0 = r0, mu0 = mu0,
      kernel = kernel, loss = loss,
      control = list(trace=0)
    )

    best_index <- which.min(opt_res$value)
    gamma_opt <- as.numeric(opt_res[best_index, 1:d])
    theta_opt <- 10^gamma_opt
    loss_min <- opt_res$value[best_index]

  } else {
    theta_opt <- theta
    loss_min <- loss_fun_nb(
      gamma = log10(theta_opt), Xnorm = Xnorm, y = y, phi = phi,
      prior = prior, r0 = r0, mu0 = mu0,
      kernel = kernel, loss = loss
    )
  }

  # -------------------------- Compute kernel matrix --------------------------
  K <- kernel_matrix(Xnorm, theta = theta_opt, kernel = kernel)

  # -------------------------- Compute prior parameters --------------------------
  if (prior == "noninformative") {
    alpha0 <- rep(0.01, n)
    beta0 <- rep(0.01, n)
  } else if (prior == "fixed") {
    alpha0 <- rep(r0 * mu0, n)
    beta0 <- rep(r0, n)
  } else if (prior == "adaptive") {
    w <- K / rowSums(K)
    mu_local <- as.vector(w %*% y)
    r_local <- r0 * rowSums(K)
    alpha0 <- r_local * mu_local
    beta0 <- rep(r0, n)
  }

  # -------------------------- Compute posterior parameters --------------------------
  alpha_n <- alpha0 + as.vector(K %*% y)
  beta_n <- beta0 + as.vector(K %*% rep(phi, n))

  # -------------------------- Return model --------------------------
  NBKP_model <- list(
    theta_opt = theta_opt, kernel = kernel,
    loss = loss, loss_min = loss_min,
    X = X, Xnorm = Xnorm, Xbounds = Xbounds, y = y,
    phi = phi,
    prior = prior, r0 = r0, mu0 = ifelse(prior == "fixed", mu0, NA),
    alpha0 = alpha0, beta0 = beta0,
    alpha_n = alpha_n, beta_n = beta_n
  )

  class(NBKP_model) <- "NBKP"
  return(NBKP_model)
}