R/boot_lucid.R

Defines functions gen_ci split_serial_boot_ci lucid_par_serial extract_serial_boot_template extract_parallel_stage_vector extract_early_stage_vector build_serial_transition_labels_boot restore_serial_Z_from_data flatten_serial_Z has_unselected_feature_serial align_boot_vector extract_parallel_gamma normalize_parallel_mu normalize_parallel_beta extract_parallel_boot_vector has_unselected_feature refit_bootstrap_zero_penalty get_model_rho_values normalize_bootstrap_model lucid_par_parallel lucid_par_early boot_lucid

Documented in boot_lucid gen_ci

#' @title Inference of LUCID model based on bootstrap resampling
#'
#' @description Generate \code{R} bootstrap replicates of LUCID parameters and
#' derive confidence interval (CI) based on bootstrap. Bootstrap replicates are
#' generated by nonparametric resampling, implemented with the \code{ordinary}
#' method of \code{boot::boot}. Supports \code{lucid_model = "early"},
#' \code{lucid_model = "parallel"}, and \code{lucid_model = "serial"}.
#'
#' @param G Exposures, a numeric vector, matrix, or data frame. Categorical variable
#' should be transformed into dummy variables. If a matrix or data frame, rows
#' represent observations and columns correspond to variables.
#' @param Z Omics data: for LUCID early integration, a numeric matrix/data frame; for LUCID in
#' parallel, a list of numeric matrices/data frames. Rows correspond to observations
#' and columns correspond to variables.
#' @param Y Outcome, a numeric vector. Categorical variable is not allowed. Binary
#' outcome should be coded as 0 and 1.
#' @param lucid_model Specifying LUCID model, "early" for early integration,
#' "parallel" for LUCID in parallel, "serial" for LUCID in serial.
#' Bootstrap inference is implemented for all three model types.
#' @param CoG Optional, covariates to be adjusted for estimating the latent cluster.
#' A numeric vector, matrix or data frame. Categorical variable should be transformed
#' into dummy variables.
#' @param CoY Optional, covariates to be adjusted for estimating the association
#' between latent cluster and the outcome. A numeric vector, matrix or data frame.
#' Categorical variable should be transformed into dummy variables.
#' @param model A LUCID model fitted by \code{estimate_lucid}.
#' If the fitted model uses nonzero penalties, \code{boot_lucid} will
#' automatically refit a zero-penalty model as fallback because bootstrap
#' inference is only supported for \code{Rho_G = Rho_Z_Mu = Rho_Z_Cov = 0}.
#' @param conf A numeric scalar between 0 and 1 to specify confidence level(s)
#' of the required interval(s).
#' @param R An integer to specify number of bootstrap replicates for LUCID model.
#' If feasible, it is recommended to set R >= 1000.
#' @param verbose A flag indicates whether detailed information
#' is printed in console. Default is FALSE.
#' 
#' @return A list containing:
#' \item{beta}{Bootstrap CI table(s) for G-to-X effects. For
#' \code{lucid_model = "parallel"}, this is a list by omics layer and includes
#' the multinomial intercept plus exposures in \code{G} (not \code{CoG}).}
#' \item{mu}{Bootstrap CI table(s) for cluster-specific means of omics features.
#' For \code{lucid_model = "parallel"}, this is a list by omics layer.}
#' \item{gamma}{Bootstrap CI table for X-to-Y parameters.}
#' \item{stage}{For \code{lucid_model = "serial"}, a list of stage-wise CI tables
#' (each stage contains \code{beta}, \code{mu}, and \code{gamma} for the final stage only).}
#' \item{bootstrap}{The \code{boot} object returned by \code{boot::boot}.}
#'
#' @export
#'
#' @import boot
#' @import progress
#'
#' @examples
#' \donttest{
#' # use simulated data
#' G <- sim_data$G[1:300, , drop = FALSE]
#' Z <- sim_data$Z[1:300, , drop = FALSE]
#' Y_normal <- sim_data$Y_normal[1:300]
#'
#' # fit lucid model
#' fit1 <- estimate_lucid(G = G, Z = Z, Y = Y_normal, lucid_model = "early", 
#' family = "normal", K = 2,
#' seed = 1008)
#'
#' # conduct bootstrap resampling
#' boot1 <- suppressWarnings(
#'   boot_lucid(G = G, Z = Z, Y = Y_normal, 
#'              lucid_model = "early", model = fit1, R = 5)
#' )
#'
#' # Use 90% CI
#' boot2 <- suppressWarnings(
#'   boot_lucid(G = G, Z = Z, Y = Y_normal, lucid_model = "early", 
#'              model = fit1, R = 5, conf = 0.9)
#' )
#' }
boot_lucid <- function(G,
                       Z,
                       Y,
                       lucid_model = c("early", "parallel","serial"),
                       CoG = NULL,
                       CoY = NULL,
                       model,
                       conf = 0.95,
                       R = 100,
                       verbose = FALSE) {
  lucid_model <- match.arg(lucid_model)
  model <- normalize_bootstrap_model(
    model = model,
    lucid_model = lucid_model,
    G = G,
    Z = Z,
    Y = Y,
    CoG = CoG,
    CoY = CoY
  )

  # prepare data for bootstrap (boot function require data in a matrix form,
  # list data structure doesn't work)
  if(!is.null(model$select) &&
     (!is.null(model$select$selectG) || !is.null(model$select$selectZ)) &&
     (has_unselected_feature(model$select$selectG) || has_unselected_feature(model$select$selectZ))) {
    stop("Refit LUCID model with selected feature first then conduct bootstrap inference")
  }
  if (lucid_model == "serial" && has_unselected_feature_serial(model)) {
    stop("Refit serial LUCID model with selected feature first then conduct bootstrap inference")
  }
  if (lucid_model == "early"){

    # ========================== Early Integration ==========================
    G <- as.matrix(G)
    Z <- as.matrix(Z)
    Y <- as.matrix(Y)
    dimG <- ncol(G)
    dimZ <- ncol(Z)
    dimCoG <- if (is.null(CoG)) 0 else ncol(as.matrix(CoG))
    dimCoY <- if (is.null(CoY)) 0 else ncol(as.matrix(CoY))
    K <- model$K
    alldata <- as.data.frame(cbind(G, Z, Y, CoG, CoY))

    # bootstrap
    if(verbose){
      cat(paste0("Use Bootstrap resampling to derive ", 100 * conf, "% CI for LUCID \n"))
    }
    #initialize progress bar object
    pb <- progress::progress_bar$new(total = R + 1)
    bootstrap <- boot::boot(data = alldata,
                            statistic = lucid_par_early,
                            R = R,
                            dimG = dimG,
                            dimZ = dimZ,
                            dimCoY = dimCoY,
                            dimCoG = dimCoG,
                            model = model,
                            prog = pb)

    # bootstrap CIs
    ci <- gen_ci(bootstrap,
                conf = conf)

    # organize CIs
    beta <- ci[1:((K - 1) * dimG), ]
    mu <- ci[((K - 1) * dimG + 1): ((K - 1) * dimG + K * dimZ), ]
    gamma <- ci[-(1:((K - 1) * dimG + K * dimZ)), ]
    return(list(beta = beta,
                mu = mu,
                gamma = gamma,
                bootstrap = bootstrap))
  } else if (lucid_model == "parallel"){
      # ========================== Lucid in Parallel ==========================
      G <- as.matrix(G)
      Gnames_exposure <- colnames(G)
      if(is.null(Gnames_exposure) || length(Gnames_exposure) != ncol(G)) {
        Gnames_exposure <- paste0("G", seq_len(ncol(G)))
      }
      Y <- as.matrix(Y)
      if(!is.list(Z)) {
        stop("For lucid_model = 'parallel', input 'Z' should be a list of matrices/data frames")
      }
      Z <- lapply(Z, as.matrix)
      nOmics <- length(Z)
      if(nOmics == 0) {
        stop("For lucid_model = 'parallel', input 'Z' should contain at least one omics layer")
      }
      dimG <- ncol(G)
      dimZ <- sapply(Z, ncol)
      dimCoG <- if (is.null(CoG)) 0 else ncol(as.matrix(CoG))
      dimCoY <- if (is.null(CoY)) 0 else ncol(as.matrix(CoY))
      K <- model$K
      if(length(K) != nOmics) {
        stop("Length of model$K does not match number of omics layers in Z")
      }

      z_combined <- do.call(cbind, Z)
      alldata <- as.data.frame(cbind(G, z_combined, Y, CoG, CoY))

      # define template from fitted model to enforce fixed bootstrap statistic length
      template <- extract_parallel_boot_vector(
        model = model,
        dimG = dimG,
        dimZ = dimZ,
        Gnames_exposure = Gnames_exposure
      )
      template_names <- names(template)
      n_template <- length(template)

      if(verbose){
        cat(paste0("Use Bootstrap resampling to derive ", 100 * conf, "% CI for LUCID in parallel \n"))
      }
      pb <- progress::progress_bar$new(total = R + 1)
      bootstrap <- boot::boot(data = alldata,
                              statistic = lucid_par_parallel,
                              R = R,
                              dimG = dimG,
                              dimZ = dimZ,
                              dimCoY = dimCoY,
                              dimCoG = dimCoG,
                              model = model,
                              Gnames_exposure = Gnames_exposure,
                              template_names = template_names,
                              n_template = n_template,
                              prog = pb)

      ci <- gen_ci(bootstrap, conf = conf)

      # split outputs by layer for easier consumption
      n_beta <- as.integer((K - 1) * (dimG + 1))
      n_mu <- as.integer(K * dimZ)
      beta <- vector("list", nOmics)
      mu <- vector("list", nOmics)
      idx_start <- 1
      for(i in seq_len(nOmics)) {
        idx_end <- idx_start + n_beta[i] - 1
        beta[[i]] <- ci[idx_start:idx_end, , drop = FALSE]
        idx_start <- idx_end + 1
      }
      for(i in seq_len(nOmics)) {
        idx_end <- idx_start + n_mu[i] - 1
        mu[[i]] <- ci[idx_start:idx_end, , drop = FALSE]
        idx_start <- idx_end + 1
      }
      gamma <- ci[idx_start:nrow(ci), , drop = FALSE]

      names(beta) <- paste0("Layer", seq_len(nOmics))
      names(mu) <- paste0("Layer", seq_len(nOmics))

      return(list(beta = beta,
                  mu = mu,
                  gamma = gamma,
                  bootstrap = bootstrap))
  } else if (lucid_model == "serial"){
      # ========================== Lucid in Serial ==========================
      G <- as.matrix(G)
      Y <- as.matrix(Y)
      dimG <- ncol(G)
      dimCoG <- if (is.null(CoG)) 0 else ncol(as.matrix(CoG))
      dimCoY <- if (is.null(CoY)) 0 else ncol(as.matrix(CoY))
      z_flat <- flatten_serial_Z(Z)
      alldata <- as.data.frame(cbind(G, z_flat$flat, Y, CoG, CoY))

      template <- extract_serial_boot_template(model)
      template_names <- names(template$vector)
      n_template <- length(template_names)

      if(verbose){
        cat(paste0("Use Bootstrap resampling to derive ", 100 * conf, "% CI for LUCID in serial \n"))
      }
      pb <- progress::progress_bar$new(total = R + 1)
      bootstrap <- boot::boot(
        data = alldata,
        statistic = lucid_par_serial,
        R = R,
        dimG = dimG,
        dimCoY = dimCoY,
        dimCoG = dimCoG,
        z_meta = z_flat$meta,
        model = model,
        template_names = template_names,
        n_template = n_template,
        prog = pb
      )

      ci <- gen_ci(bootstrap, conf = conf)
      stage_ci <- split_serial_boot_ci(ci = ci, stage_layout = template$stage_layout)

      return(list(
        stage = stage_ci,
        bootstrap = bootstrap
      ))
  }
}



# function to calculate parameters of early LUCID model. use as statisitc input for
# boot function.
lucid_par_early <- function(data, indices, model, dimG, dimZ, dimCoY, dimCoG, prog) {
  #display progress with each run of the function
  prog$tick()

  # prepare data
  d <- data[indices, ]
  G <- as.matrix(d[, 1:dimG])
  Z <- as.matrix(d[, (dimG + 1):(dimG + dimZ)])
  Y <- as.matrix(d[, (dimG + dimZ + 1)])
  CoG <- CoY <- NULL
  K <- model$K
  if(dimCoG > 0){
    CoG <- as.matrix(d[, (dimG + dimZ + 2):(dimG + dimZ + dimCoG + 1)])
  }
  if(dimCoY > 0 && dimCoG > 0){
    CoY <- as.matrix(d[, (dimG + dimZ + dimCoG + 2):ncol(d)])
  }
  if(dimCoY > 0 && dimCoG == 0){
    CoY <- as.matrix(d[, (dimG + dimZ + 2):ncol(d)])
  }

  # fit lucid model
  seed <- sample(1:2000, 1)
  em_ctrl <- model$em_control
  rG <- if(!is.null(model$Rho$Rho_G)) model$Rho$Rho_G else 0
  rMu <- if(!is.null(model$Rho$Rho_Z_Mu)) model$Rho$Rho_Z_Mu else 0
  rCov <- if(!is.null(model$Rho$Rho_Z_Cov)) model$Rho$Rho_Z_Cov else 0
  tol_fit <- if(!is.null(em_ctrl$tol)) em_ctrl$tol else 0.001
  max_itr_fit <- if(!is.null(em_ctrl$max_itr)) em_ctrl$max_itr else 1000
  max_tot_fit <- if(!is.null(em_ctrl$max_tot.itr)) em_ctrl$max_tot.itr else 10000
  invisible(capture.output(try_lucid <- try(est_lucid(G = G,
                                                      Z = Z,
                                                      Y = Y,
                                                      CoY = CoY,
                                                      CoG = CoG,
                                                      lucid_model = "early",
                                                      family = model$family,
                                                      init_omic.data.model = model$init_omic.data.model,
                                                      K = K,
                                                      tol = tol_fit,
                                                      max_itr = max_itr_fit,
                                                      max_tot.itr = max_tot_fit,
                                                      Rho_G = rG,
                                                      Rho_Z_Mu = rMu,
                                                      Rho_Z_Cov = rCov,
                                                      init_impute = model$init_impute,
                                                      init_par = model$init_par,
                                                      seed = seed))))
  if("try-error" %in% class(try_lucid)){
    n_par <- (K - 1) * dimG + K * dimZ + length(model$res_Gamma$beta)
    par_lucid <- rep(NA_real_, n_par)
  } else{
    beta_col_idx <- 1 + seq_len(dimG)
    beta_exposure <- try_lucid$res_Beta[-1, beta_col_idx, drop = FALSE]
    par_lucid <- c(as.vector(t(beta_exposure)),
                   as.vector(t(try_lucid$res_Mu)),
                   try_lucid$res_Gamma$beta)
    G_names <- as.vector(sapply(2:K, function(x) {
      paste0(colnames(try_lucid$res_Beta)[beta_col_idx],
             ".cluster", x)
    }))
    Z_names <- as.vector(sapply(1:K, function(x) {
      paste0(colnames(try_lucid$res_Mu),
             ".cluster", x)
    }))
    if(is.null(names(try_lucid$res_Gamma$beta))) {
      Y_names <- paste0("cluster", 1:K)
    } else {
      Y_names <- names(try_lucid$res_Gamma$beta)
    }
    names(par_lucid) <- c(G_names, Z_names, Y_names)
    converge <- TRUE
  }
  return(par_lucid)
}


# function to calculate parameters of parallel LUCID model. use as statistic input for
# boot function.
lucid_par_parallel <- function(data, indices, model, dimG, dimZ, dimCoY, dimCoG,
                               Gnames_exposure, template_names, n_template, prog) {
  prog$tick()

  d <- data[indices, , drop = FALSE]
  col_start <- 1

  G <- as.matrix(d[, col_start:(col_start + dimG - 1), drop = FALSE])
  col_start <- col_start + dimG

  nOmics <- length(dimZ)
  Z <- vector("list", nOmics)
  for(i in seq_len(nOmics)) {
    zdim <- dimZ[i]
    Z[[i]] <- as.matrix(d[, col_start:(col_start + zdim - 1), drop = FALSE])
    col_start <- col_start + zdim
  }

  Y <- as.matrix(d[, col_start, drop = FALSE])
  col_start <- col_start + 1

  CoG <- CoY <- NULL
  if(dimCoG > 0) {
    CoG <- as.matrix(d[, col_start:(col_start + dimCoG - 1), drop = FALSE])
    col_start <- col_start + dimCoG
  }
  if(dimCoY > 0) {
    CoY <- as.matrix(d[, col_start:(col_start + dimCoY - 1), drop = FALSE])
  }

  seed <- sample(1:2000, 1)
  rG <- if(!is.null(model$Rho$Rho_G)) model$Rho$Rho_G else 0
  rMu <- if(!is.null(model$Rho$Rho_Z_Mu)) model$Rho$Rho_Z_Mu else 0
  rCov <- if(!is.null(model$Rho$Rho_Z_Cov)) model$Rho$Rho_Z_Cov else 0
  em_ctrl <- model$em_control
  tol_fit <- if(!is.null(em_ctrl$tol)) em_ctrl$tol else 0.001
  max_itr_fit <- if(!is.null(em_ctrl$max_itr)) em_ctrl$max_itr else 1000
  max_tot_fit <- if(!is.null(em_ctrl$max_tot.itr)) em_ctrl$max_tot.itr else 10000

  invisible(capture.output(try_lucid <- try(est_lucid(
    G = G,
    Z = Z,
    Y = Y,
    CoY = CoY,
    CoG = CoG,
    lucid_model = "parallel",
    family = model$family,
    init_omic.data.model = model$init_omic.data.model,
    K = model$K,
    tol = tol_fit,
    max_itr = max_itr_fit,
    max_tot.itr = max_tot_fit,
    init_impute = model$init_impute,
    init_par = model$init_par,
    useY = model$useY,
    Rho_G = rG,
    Rho_Z_Mu = rMu,
    Rho_Z_Cov = rCov,
    seed = seed
  ), silent = TRUE)))

  if("try-error" %in% class(try_lucid)) {
    par_lucid <- rep(NA_real_, n_template)
    names(par_lucid) <- template_names
  } else {
    par_raw <- extract_parallel_boot_vector(
      model = try_lucid,
      dimG = dimG,
      dimZ = dimZ,
      Gnames_exposure = Gnames_exposure
    )
    par_lucid <- align_boot_vector(par_raw, template_names = template_names)
  }

  return(par_lucid)
}


# Normalize input model for bootstrap:
# bootstrap CI is only supported on zero-penalty models.
normalize_bootstrap_model <- function(model, lucid_model, G, Z, Y, CoG = NULL, CoY = NULL) {
  rho_vals <- get_model_rho_values(model)
  if (all(abs(rho_vals) <= sqrt(.Machine$double.eps))) {
    return(model)
  }

  warning(
    paste0(
      "Bootstrap CI is only supported for zero-penalty models ",
      "(Rho_G = Rho_Z_Mu = Rho_Z_Cov = 0). ",
      "Detected nonzero penalty (Rho_G=", format(rho_vals[1], digits = 6),
      ", Rho_Z_Mu=", format(rho_vals[2], digits = 6),
      ", Rho_Z_Cov=", format(rho_vals[3], digits = 6),
      "). Falling back to a zero-penalty refit before bootstrap."
    ),
    call. = FALSE
  )

  refit_bootstrap_zero_penalty(
    model = model,
    lucid_model = lucid_model,
    G = G,
    Z = Z,
    Y = Y,
    CoG = CoG,
    CoY = CoY
  )
}


get_model_rho_values <- function(model) {
  get_scalar <- function(x) {
    if (is.null(x)) return(0)
    v <- suppressWarnings(as.numeric(x))
    v <- v[is.finite(v)]
    if (length(v) == 0) return(0)
    v[1]
  }
  rho <- model$Rho
  c(
    Rho_G = get_scalar(rho$Rho_G),
    Rho_Z_Mu = get_scalar(rho$Rho_Z_Mu),
    Rho_Z_Cov = get_scalar(rho$Rho_Z_Cov)
  )
}


refit_bootstrap_zero_penalty <- function(model, lucid_model, G, Z, Y, CoG = NULL, CoY = NULL) {
  em_ctrl <- model$em_control
  tol_fit <- if(!is.null(em_ctrl$tol)) em_ctrl$tol else 0.001
  max_itr_fit <- if(!is.null(em_ctrl$max_itr)) em_ctrl$max_itr else 1000
  max_tot_fit <- if(!is.null(em_ctrl$max_tot.itr)) em_ctrl$max_tot.itr else 10000
  seed_fit <- if(!is.null(model$seed) && length(model$seed) > 0 && is.finite(model$seed[1])) {
    as.integer(model$seed[1])
  } else {
    123
  }
  useY_fit <- if(!is.null(model$useY)) model$useY else TRUE

  invisible(capture.output(
    refit_try <- try(
      estimate_lucid(
        lucid_model = lucid_model,
        G = G,
        Z = Z,
        Y = Y,
        CoG = CoG,
        CoY = CoY,
        K = model$K,
        init_omic.data.model = model$init_omic.data.model,
        useY = useY_fit,
        tol = tol_fit,
        max_itr = max_itr_fit,
        max_tot.itr = max_tot_fit,
        Rho_G = 0,
        Rho_Z_Mu = 0,
        Rho_Z_Cov = 0,
        family = model$family,
        seed = seed_fit,
        init_impute = model$init_impute,
        init_par = model$init_par,
        verbose = FALSE
      ),
      silent = TRUE
    )
  ))

  if("try-error" %in% class(refit_try)) {
    stop(
      paste0(
        "Bootstrap CI requires a zero-penalty model and fallback refit failed. ",
        "Please fit estimate_lucid(..., Rho_G = 0, Rho_Z_Mu = 0, Rho_Z_Cov = 0) and retry."
      )
    )
  }
  refit_try
}


# recursively detect if any feature is unselected
has_unselected_feature <- function(x) {
  if(is.null(x)) {
    return(FALSE)
  }
  if(is.list(x)) {
    return(any(vapply(x, has_unselected_feature, logical(1))))
  }
  x_logical <- as.logical(x)
  any(!x_logical, na.rm = TRUE)
}


# Extract a fixed-shape vector of parallel parameters:
# (all layer beta exposure effects for non-reference clusters,
#  all layer mu values, gamma coefficients).
extract_parallel_boot_vector <- function(model, dimG, dimZ, Gnames_exposure = NULL) {
  K <- as.integer(model$K)
  nOmics <- length(K)

  if(!is.null(Gnames_exposure) && length(Gnames_exposure) == dimG) {
    Gnames <- Gnames_exposure
  } else {
    Gnames <- model$var.names$Gnames
    if(!is.null(Gnames) && length(Gnames) >= dimG) {
      Gnames <- Gnames[seq_len(dimG)]
    } else {
      Gnames <- paste0("G", seq_len(dimG))
    }
  }

  Znames <- model$var.names$Znames
  if(is.null(Znames) || length(Znames) != nOmics) {
    Znames <- lapply(seq_len(nOmics), function(i) paste0("Z", i, "_", seq_len(dimZ[i])))
  }

  beta_all <- NULL
  beta_names <- NULL
  beta_list <- model$res_Beta$Beta
  if(is.null(beta_list) && is.list(model$res_Beta)) {
    beta_list <- model$res_Beta
  }

  for(i in seq_len(nOmics)) {
    beta_mat <- normalize_parallel_beta(beta_i = beta_list[[i]], K_i = K[i], Gnames = Gnames)
    beta_vec <- as.vector(t(beta_mat))
    beta_name_i <- as.vector(sapply(2:K[i], function(k) {
      paste0("Layer", i, ".", colnames(beta_mat), ".cluster", k)
    }))
    beta_all <- c(beta_all, beta_vec)
    beta_names <- c(beta_names, beta_name_i)
  }

  mu_all <- NULL
  mu_names <- NULL
  for(i in seq_len(nOmics)) {
    z_names_i <- Znames[[i]]
    if(is.null(z_names_i) || length(z_names_i) != dimZ[i]) {
      z_names_i <- paste0("Z", i, "_", seq_len(dimZ[i]))
    }
    mu_mat <- normalize_parallel_mu(mu_i = model$res_Mu[[i]], K_i = K[i], Znames_i = z_names_i)
    mu_vec <- as.vector(t(mu_mat))
    mu_name_i <- as.vector(sapply(seq_len(K[i]), function(k) {
      paste0("Layer", i, ".", z_names_i, ".cluster", k)
    }))
    mu_all <- c(mu_all, mu_vec)
    mu_names <- c(mu_names, mu_name_i)
  }

  gamma_all <- extract_parallel_gamma(model)
  gamma_names <- names(gamma_all)
  if(is.null(gamma_names)) {
    gamma_names <- paste0("gamma", seq_along(gamma_all))
  }
  gamma_names <- paste0("Y.", gamma_names)

  par <- c(beta_all, mu_all, as.numeric(gamma_all))
  names(par) <- c(beta_names, mu_names, gamma_names)
  par
}


normalize_parallel_beta <- function(beta_i, K_i, Gnames) {
  K_i <- as.integer(K_i)
  dimG <- length(Gnames)
  target <- matrix(NA_real_, nrow = max(K_i - 1, 0), ncol = dimG + 1)
  colnames(target) <- c("(Intercept)", Gnames)
  if(K_i <= 1 || is.null(beta_i)) {
    return(target)
  }

  if(is.null(dim(beta_i))) {
    beta_i <- matrix(beta_i, nrow = 1)
  } else {
    beta_i <- as.matrix(beta_i)
  }

  # keep non-reference clusters only
  if(nrow(beta_i) == K_i) {
    beta_i <- beta_i[2:K_i, , drop = FALSE]
  } else if(nrow(beta_i) >= (K_i - 1)) {
    beta_i <- beta_i[seq_len(K_i - 1), , drop = FALSE]
  }

  beta_exposure <- matrix(NA_real_, nrow = nrow(beta_i), ncol = dimG + 1)
  colnames(beta_exposure) <- c("(Intercept)", Gnames)

  if(!is.null(colnames(beta_i))) {
    idx_int <- match("(Intercept)", colnames(beta_i))
    if(!is.na(idx_int)) {
      beta_exposure[, 1] <- beta_i[, idx_int, drop = TRUE]
    } else if(ncol(beta_i) >= 1) {
      beta_exposure[, 1] <- beta_i[, 1, drop = TRUE]
    }
    idx <- match(Gnames, colnames(beta_i))
    valid <- which(!is.na(idx))
    if(length(valid) > 0) {
      beta_exposure[, 1 + valid] <- beta_i[, idx[valid], drop = FALSE]
    } else {
      # Fallback for naming mismatches: use positional exposure columns.
      offset <- if(ncol(beta_i) >= 1 && grepl("Intercept", colnames(beta_i)[1], fixed = TRUE)) 2 else 1
      if(offset <= ncol(beta_i)) {
        n_fill <- min(dimG, ncol(beta_i) - offset + 1)
        if(n_fill > 0) {
          beta_exposure[, 1 + seq_len(n_fill)] <- beta_i[, seq.int(offset, length.out = n_fill), drop = FALSE]
        }
      }
    }
  } else {
    if(ncol(beta_i) >= 1) {
      beta_exposure[, 1] <- beta_i[, 1, drop = TRUE]
    }
    if(ncol(beta_i) >= (dimG + 1)) {
      beta_exposure[, 1 + seq_len(dimG)] <- beta_i[, 2:(dimG + 1), drop = FALSE]
    } else {
      n_fill <- min(dimG, ncol(beta_i))
      if(n_fill > 0) {
        beta_exposure[, 1 + seq_len(n_fill)] <- beta_i[, seq_len(n_fill), drop = FALSE]
      }
    }
  }

  n_fill <- min(nrow(target), nrow(beta_exposure))
  if(n_fill > 0) {
    target[seq_len(n_fill), ] <- beta_exposure[seq_len(n_fill), , drop = FALSE]
  }
  target
}


normalize_parallel_mu <- function(mu_i, K_i, Znames_i) {
  K_i <- as.integer(K_i)
  dimZ_i <- length(Znames_i)
  target <- matrix(NA_real_, nrow = K_i, ncol = dimZ_i)
  colnames(target) <- Znames_i
  if(is.null(mu_i)) {
    return(target)
  }

  if(is.null(dim(mu_i))) {
    mu_i <- matrix(mu_i, nrow = K_i)
  } else {
    mu_i <- as.matrix(mu_i)
  }

  if(nrow(mu_i) == dimZ_i && ncol(mu_i) == K_i) {
    mu_i <- t(mu_i)
  }

  n_row <- min(K_i, nrow(mu_i))
  n_col <- min(dimZ_i, ncol(mu_i))
  if(n_row > 0 && n_col > 0) {
    target[seq_len(n_row), seq_len(n_col)] <- mu_i[seq_len(n_row), seq_len(n_col), drop = FALSE]
  }
  target
}


extract_parallel_gamma <- function(model) {
  gamma <- NULL
  if(!is.null(model$res_Gamma$fit)) {
    gamma <- try(coef(model$res_Gamma$fit), silent = TRUE)
    if(inherits(gamma, "try-error")) {
      gamma <- NULL
    }
  }
  if(is.null(gamma)) {
    family_parallel <- to_parallel_family(model$family)
    if(family_parallel == "gaussian") {
      gamma <- model$res_Gamma$Gamma$mu
    } else {
      gamma <- model$res_Gamma$fit$coefficients
    }
  }
  gamma
}


align_boot_vector <- function(par_raw, template_names) {
  par <- rep(NA_real_, length(template_names))
  names(par) <- template_names
  if(length(par_raw) == 0) {
    return(par)
  }

  par_names <- names(par_raw)
  if(!is.null(par_names) && all(!is.na(par_names))) {
    idx <- match(par_names, template_names)
    valid <- which(!is.na(idx))
    # Use name-based alignment only when complete; otherwise use positional fallback.
    if(length(valid) == length(par_raw)) {
      par[idx[valid]] <- as.numeric(par_raw[valid])
      return(par)
    }
  }

  n_fill <- min(length(par), length(par_raw))
  par[seq_len(n_fill)] <- as.numeric(par_raw[seq_len(n_fill)])
  par
}


has_unselected_feature_serial <- function(model) {
  if (is.null(model$submodel) || !is.list(model$submodel)) {
    return(FALSE)
  }
  any(vapply(model$submodel, function(sm) {
    if (is.null(sm$select)) {
      return(FALSE)
    }
    has_unselected_feature(sm$select$selectG) || has_unselected_feature(sm$select$selectZ)
  }, logical(1)))
}


flatten_serial_Z <- function(Z) {
  leaf_mats <- list()
  leaf_id <- 0L
  recurse <- function(node) {
    if (is.list(node)) {
      children <- vector("list", length(node))
      for (i in seq_along(node)) {
        children[[i]] <- recurse(node[[i]])
      }
      names(children) <- names(node)
      return(list(kind = "list", children = children, names = names(node)))
    }
    z_mat <- as.matrix(node)
    if (!is.numeric(z_mat)) {
      stop("All serial Z blocks must be numeric.")
    }
    leaf_id <<- leaf_id + 1L
    leaf_mats[[leaf_id]] <<- z_mat
    list(kind = "leaf", leaf_id = leaf_id, ncol = ncol(z_mat), colnames = colnames(z_mat))
  }
  meta <- recurse(Z)
  flat <- do.call(cbind, leaf_mats)
  list(flat = flat, meta = meta)
}


restore_serial_Z_from_data <- function(d, col_start, z_meta) {
  idx <- as.integer(col_start)
  recurse <- function(meta) {
    if (identical(meta$kind, "leaf")) {
      cols <- idx:(idx + meta$ncol - 1L)
      z_mat <- as.matrix(d[, cols, drop = FALSE])
      if (!is.null(meta$colnames) && length(meta$colnames) == meta$ncol) {
        colnames(z_mat) <- meta$colnames
      }
      idx <<- idx + meta$ncol
      return(z_mat)
    }
    out <- vector("list", length(meta$children))
    for (i in seq_along(meta$children)) {
      out[[i]] <- recurse(meta$children[[i]])
    }
    names(out) <- meta$names
    out
  }
  list(Z = recurse(z_meta), next_col = idx)
}


build_serial_transition_labels_boot <- function(submodels) {
  n_stage <- length(submodels)
  out <- vector("list", n_stage)
  out[[1]] <- character(0)
  for (i in seq.int(2, n_stage)) {
    prev <- submodels[[i - 1L]]
    if (inherits(prev, "early_lucid")) {
      k_prev <- as.integer(prev$K)
      if (length(k_prev) > 0 && !is.na(k_prev) && k_prev > 1) {
        out[[i]] <- paste0("Stage", i - 1L, ".cluster", seq.int(2, k_prev))
      } else {
        out[[i]] <- character(0)
      }
    } else if (inherits(prev, "lucid_parallel")) {
      k_prev <- as.integer(prev$K)
      labels <- character(0)
      for (layer_idx in seq_along(k_prev)) {
        if (!is.na(k_prev[layer_idx]) && k_prev[layer_idx] > 1) {
          labels <- c(labels, paste0("Stage", i - 1L, ".Layer", layer_idx,
                                     ".cluster", seq.int(2, k_prev[layer_idx])))
        }
      }
      out[[i]] <- labels
    } else {
      out[[i]] <- character(0)
    }
  }
  out
}


extract_early_stage_vector <- function(stage_model, is_last_stage, transition_labels = character(0)) {
  K <- as.integer(stage_model$K)
  beta_mat <- as.matrix(stage_model$res_Beta)
  if (is.null(dim(beta_mat))) {
    beta_mat <- matrix(beta_mat, nrow = 1)
  }
  if (nrow(beta_mat) == K) {
    beta_use <- beta_mat[2:K, , drop = FALSE]
    cluster_ids <- 2:K
  } else if (nrow(beta_mat) == (K - 1)) {
    beta_use <- beta_mat
    cluster_ids <- 2:K
  } else {
    beta_use <- beta_mat
    cluster_ids <- seq_len(nrow(beta_use))
  }
  if (nrow(beta_use) == 0) {
    beta_use <- matrix(numeric(0), nrow = 0, ncol = ncol(beta_mat))
    cluster_ids <- integer(0)
  }
  n_feat <- max(0, ncol(beta_use) - 1L)
  if (length(transition_labels) > 0 && n_feat > 0) {
    feat_names <- transition_labels[seq_len(min(length(transition_labels), n_feat))]
    if (length(feat_names) < n_feat) {
      feat_names <- c(feat_names, paste0("PrevStageCluster", seq.int(length(feat_names) + 1L, n_feat)))
    }
    beta_col_names <- c("(Intercept)", feat_names)
  } else {
    beta_col_names <- colnames(beta_use)
    if (is.null(beta_col_names) || length(beta_col_names) != ncol(beta_use)) {
      beta_col_names <- c("(Intercept)", paste0("G", seq_len(n_feat)))
    } else {
      beta_col_names[1] <- "(Intercept)"
    }
  }
  if (ncol(beta_use) == length(beta_col_names)) {
    colnames(beta_use) <- beta_col_names
  }
  beta_vec <- as.numeric(t(beta_use))
  beta_names <- as.vector(sapply(cluster_ids, function(k) paste0(beta_col_names, ".cluster", k)))
  if (length(beta_vec) == length(beta_names)) {
    names(beta_vec) <- beta_names
  }

  mu_mat <- as.matrix(stage_model$res_Mu)
  if (is.null(dim(mu_mat))) {
    mu_mat <- matrix(mu_mat, nrow = K)
  }
  if (nrow(mu_mat) != K && ncol(mu_mat) == K) {
    mu_mat <- t(mu_mat)
  }
  z_names <- stage_model$var.names$Znames
  if (is.null(z_names) || length(z_names) != ncol(mu_mat)) {
    z_names <- paste0("Z", seq_len(ncol(mu_mat)))
  }
  colnames(mu_mat) <- z_names
  mu_vec <- as.numeric(t(mu_mat))
  mu_names <- as.vector(sapply(seq_len(nrow(mu_mat)), function(k) paste0(z_names, ".cluster", k)))
  if (length(mu_vec) == length(mu_names)) {
    names(mu_vec) <- mu_names
  }

  gamma_vec <- numeric(0)
  if (isTRUE(is_last_stage)) {
    gamma_raw <- stage_model$res_Gamma$beta
    gamma_vec <- as.numeric(gamma_raw)
    gamma_names <- names(gamma_raw)
    if (is.null(gamma_names) || length(gamma_names) != length(gamma_vec)) {
      gamma_names <- paste0("gamma", seq_along(gamma_vec))
    }
    names(gamma_vec) <- paste0("Y.", gamma_names)
  }
  vec <- c(beta_vec, mu_vec, gamma_vec)
  list(vec = vec, layout = list(type = "early", n_beta = length(beta_vec),
                                n_mu = length(mu_vec), n_gamma = length(gamma_vec)))
}


extract_parallel_stage_vector <- function(stage_model, is_last_stage, transition_labels = character(0)) {
  K <- as.integer(stage_model$K)
  dimZ <- as.integer(sapply(stage_model$Z, ncol))
  if (length(transition_labels) > 0) {
    g_names <- transition_labels
  } else {
    g_names <- stage_model$var.names$Gnames
    if (is.null(g_names) || length(g_names) == 0) {
      b1 <- stage_model$res_Beta$Beta[[1]]
      p <- if (!is.null(b1)) max(0, ncol(as.matrix(b1)) - 1L) else 0L
      g_names <- paste0("G", seq_len(p))
    }
  }
  dimG_stage <- length(g_names)
  par <- extract_parallel_boot_vector(
    model = stage_model,
    dimG = dimG_stage,
    dimZ = dimZ,
    Gnames_exposure = g_names
  )
  n_beta_layer <- as.integer((K - 1L) * (dimG_stage + 1L))
  n_mu_layer <- as.integer(K * dimZ)
  n_beta <- sum(n_beta_layer)
  n_mu <- sum(n_mu_layer)
  n_keep <- n_beta + n_mu
  if (!isTRUE(is_last_stage)) {
    par <- par[seq_len(n_keep)]
  }
  n_gamma <- max(0L, length(par) - n_keep)
  list(vec = par, layout = list(type = "parallel", n_beta = n_beta, n_mu = n_mu,
                                n_gamma = n_gamma, n_beta_layer = n_beta_layer,
                                n_mu_layer = n_mu_layer))
}


extract_serial_boot_template <- function(model) {
  if (is.null(model$submodel) || !is.list(model$submodel) || length(model$submodel) == 0) {
    stop("Input serial model does not contain valid submodels.")
  }
  submodels <- model$submodel
  n_stage <- length(submodels)
  transition_labels <- build_serial_transition_labels_boot(submodels)
  vec_all <- numeric(0)
  stage_layout <- vector("list", n_stage)
  idx_start <- 1L
  for (i in seq_len(n_stage)) {
    is_last <- (i == n_stage)
    sm <- submodels[[i]]
    stage_obj <- if (inherits(sm, "early_lucid")) {
      extract_early_stage_vector(sm, is_last_stage = is_last, transition_labels = transition_labels[[i]])
    } else if (inherits(sm, "lucid_parallel")) {
      extract_parallel_stage_vector(sm, is_last_stage = is_last, transition_labels = transition_labels[[i]])
    } else {
      stop("Unsupported submodel class in serial bootstrap.")
    }
    stage_vec <- as.numeric(stage_obj$vec)
    local_names <- names(stage_obj$vec)
    if (is.null(local_names) || length(local_names) != length(stage_vec)) {
      local_names <- paste0("param", seq_along(stage_vec))
    }
    prefixed_names <- paste0("Stage", i, "::", local_names)
    names(stage_vec) <- prefixed_names
    idx_end <- idx_start + length(stage_vec) - 1L
    stage_layout[[i]] <- c(stage_obj$layout, list(start = idx_start, end = idx_end,
                                                  local_names = local_names,
                                                  prefixed_names = prefixed_names))
    vec_all <- c(vec_all, stage_vec)
    idx_start <- idx_end + 1L
  }
  list(vector = vec_all, stage_layout = stage_layout)
}


lucid_par_serial <- function(data, indices, model, dimG, dimCoY, dimCoG,
                             z_meta, template_names, n_template, prog) {
  prog$tick()
  d <- data[indices, , drop = FALSE]
  col_start <- 1L
  G <- as.matrix(d[, col_start:(col_start + dimG - 1L), drop = FALSE])
  col_start <- col_start + dimG
  z_rec <- restore_serial_Z_from_data(d = d, col_start = col_start, z_meta = z_meta)
  Z <- z_rec$Z
  col_start <- z_rec$next_col
  Y <- as.matrix(d[, col_start, drop = FALSE])
  col_start <- col_start + 1L
  CoG <- CoY <- NULL
  if (dimCoG > 0) {
    CoG <- as.matrix(d[, col_start:(col_start + dimCoG - 1L), drop = FALSE])
    col_start <- col_start + dimCoG
  }
  if (dimCoY > 0) {
    CoY <- as.matrix(d[, col_start:(col_start + dimCoY - 1L), drop = FALSE])
  }
  seed <- sample(1:2000, 1)
  rG <- if (!is.null(model$Rho$Rho_G)) model$Rho$Rho_G else 0
  rMu <- if (!is.null(model$Rho$Rho_Z_Mu)) model$Rho$Rho_Z_Mu else 0
  rCov <- if (!is.null(model$Rho$Rho_Z_Cov)) model$Rho$Rho_Z_Cov else 0
  em_ctrl <- model$em_control
  tol_fit <- if (!is.null(em_ctrl$tol)) em_ctrl$tol else 0.001
  max_itr_fit <- if (!is.null(em_ctrl$max_itr)) em_ctrl$max_itr else 1000
  max_tot_fit <- if (!is.null(em_ctrl$max_tot.itr)) em_ctrl$max_tot.itr else 10000
  invisible(capture.output(try_lucid <- try(estimate_lucid(
    G = G, Z = Z, Y = Y, CoY = CoY, CoG = CoG,
    lucid_model = "serial", family = model$family,
    init_omic.data.model = model$init_omic.data.model, K = model$K,
    tol = tol_fit, max_itr = max_itr_fit, max_tot.itr = max_tot_fit,
    init_impute = model$init_impute, init_par = model$init_par,
    useY = model$useY, Rho_G = rG, Rho_Z_Mu = rMu, Rho_Z_Cov = rCov,
    seed = seed
  ), silent = TRUE)))
  if ("try-error" %in% class(try_lucid)) {
    par_lucid <- rep(NA_real_, n_template)
    names(par_lucid) <- template_names
    return(par_lucid)
  }
  par_raw <- extract_serial_boot_template(try_lucid)$vector
  align_boot_vector(par_raw = par_raw, template_names = template_names)
}


split_serial_boot_ci <- function(ci, stage_layout) {
  out <- vector("list", length(stage_layout))
  for (i in seq_along(stage_layout)) {
    lay <- stage_layout[[i]]
    stage_ci <- ci[lay$start:lay$end, , drop = FALSE]
    rownames(stage_ci) <- lay$local_names
    nb <- lay$n_beta
    nm <- lay$n_mu
    ng <- lay$n_gamma
    beta_ci <- if (nb > 0) stage_ci[seq_len(nb), , drop = FALSE] else NULL
    mu_ci <- if (nm > 0) stage_ci[seq.int(nb + 1L, nb + nm), , drop = FALSE] else NULL
    gamma_ci <- if (ng > 0) stage_ci[seq.int(nb + nm + 1L, nb + nm + ng), , drop = FALSE] else NULL
    if (identical(lay$type, "parallel")) {
      beta_list <- vector("list", length(lay$n_beta_layer))
      mu_list <- vector("list", length(lay$n_mu_layer))
      names(beta_list) <- paste0("Layer", seq_along(beta_list))
      names(mu_list) <- paste0("Layer", seq_along(mu_list))
      if (!is.null(beta_ci)) {
        st <- 1L
        for (j in seq_along(lay$n_beta_layer)) {
          nj <- lay$n_beta_layer[j]
          beta_list[[j]] <- beta_ci[seq.int(st, st + nj - 1L), , drop = FALSE]
          st <- st + nj
        }
      }
      if (!is.null(mu_ci)) {
        st <- 1L
        for (j in seq_along(lay$n_mu_layer)) {
          nj <- lay$n_mu_layer[j]
          mu_list[[j]] <- mu_ci[seq.int(st, st + nj - 1L), , drop = FALSE]
          st <- st + nj
        }
      }
      out[[i]] <- list(beta = beta_list, mu = mu_list, gamma = gamma_ci)
    } else {
      out[[i]] <- list(beta = beta_ci, mu = mu_ci, gamma = gamma_ci)
    }
  }
  out
}


#' @title generate bootstrp ci (normal, basic and percentile)
#'
#' @param x an object return by boot function
#' @param conf A numeric scalar between 0 and 1 to specify confidence level(s)
#' of the required interval(s).
#'
#' @return a matrix, the first column is the point estimate from original model
#'
gen_ci <- function(x, conf = 0.95) {
  t0 <- x$t0
  res_ci <- NULL
  for (i in 1:length(t0)) {
    ci <- try(boot::boot.ci(x,
                            index = i,
                            conf = conf,
                            type = c("norm", "perc")), silent = TRUE)
    if("try-error" %in% class(ci)) {
      temp_ci <- rep(NA_real_, 4)
    } else {
      norm_ci <- if(!is.null(ci$normal) && length(ci$normal) >= 3) ci$normal[2:3] else c(NA_real_, NA_real_)
      perc_ci <- if(!is.null(ci$percent) && length(ci$percent) >= 5) ci$percent[4:5] else c(NA_real_, NA_real_)
      temp_ci <- c(norm_ci, perc_ci)
    }
    res_ci <- rbind(res_ci,
                    temp_ci)
  }
  res <- cbind(t0, res_ci)
  colnames(res) <- c("estimate",
                     "norm_lower", "norm_upper",
                     "perc_lower", "perc_upper")
  return(res)
}

Try the LUCIDus package in your browser

Any scripts or data that you put into this service are public.

LUCIDus documentation built on March 11, 2026, 9:06 a.m.