#' Cross-validation for boosting after multiple imputation (pre-imputed inputs)
#'
#' To avoid data leakage, the data should first be split into training and
#' validation subsets within each CV fold, and imputation should be performed
#' only afterwards (imputation models are learned on the training subset and
#' then applied to the validation subset). For the final model, the full
#' dataset should be imputed in a separate, independent run.
#'
#' The recommended workflow is illustrated in the examples.
#'
#' @references
#' Kuchen, R. (2025). \emph{MIBoost: A Gradient Boosting Algorithm for Variable
#' Selection After Multiple Imputation}. arXiv:2507.21807.
#' \doi{10.48550/arXiv.2507.21807} \url{https://arxiv.org/abs/2507.21807}.
#'
#' @param X_train_list A list of length \eqn{k}. Element \code{i} is itself a
#' list of length \eqn{M} containing the \eqn{n_{train} \times p} numeric
#' design matrices for each imputed dataset in CV fold \code{i}.
#' @param y_train_list A list of length \eqn{k}. Element \code{i} is a list of
#' length \eqn{M}, where each element is a length-\eqn{n_{train}} numeric
#' response vector aligned with \code{X_train_list[[i]][[m]]}.
#' @param X_val_list A list of length \eqn{k}. Element \code{i} is a list of
#' length \eqn{M} containing the \eqn{n_{val} \times p} numeric validation
#' matrices matched to the corresponding imputed training dataset
#' in fold \code{i}.
#' @param y_val_list A list of length \eqn{k}. Element \code{i} is a list of
#' length \eqn{M}, where each element is a length-\eqn{n_{val}} continuous
#' response vector aligned with \code{X_val_list[[i]][[m]]}.
#' @param X_full A list of length \eqn{M} with the \eqn{n \times p} numeric
#' full-data design matrices (one per imputed dataset) used to fit
#' the final model.
#' @param y_full A list of length \eqn{M}, where each element is a
#' length-\eqn{n} continuous response vector corresponding to the
#' imputed dataset in \code{X_full}.
#' @param ny Learning rate. Defaults to \code{0.1}.
#' @param mstop Maximum number of boosting iterations to evaluate during
#' cross-validation. The selected \code{mstop} is the value that minimizes
#' the mean CV error over \code{1:mstop}. Default is \code{250}.
#' @param type Type of loss function. One of:
#' \code{"gaussian"} (mean squared error) for continuous responses,
#' or \code{"logistic"} (binomial deviance) for binary responses.
#' @param MIBoost Logical. If \code{TRUE}, applies the MIBoost algorithm,
#' which enforces uniform variable selection across all imputed datasets. If
#' \code{FALSE}, variables are selected independently within each imputed
#' dataset, and pooling is governed by \code{pool_threshold}.
#' @param pool Logical. If \code{TRUE}, models across the \eqn{M} imputed
#' datasets are aggregated into a single final model. If \code{FALSE},
#' \eqn{M} separate models are returned.
#' @param pool_threshold Only used when \code{MIBoost = FALSE} and \code{pool = TRUE}.
#' Controls the pooling rule when aggregating the \eqn{M} models obtained from
#' the imputed datasets into a single final model. A candidate variable is
#' included only if it is selected in at least \code{pool_threshold} (a value
#' in [0, 1]) proportion of the imputed datasets; coefficients of all other
#' variables are set to zero. A value of \code{0} corresponds to
#' estimate-averaging, while values \code{> 0} correspond to
#' selection-frequency thresholding.
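#'
#' As a minimal illustration of this rule (hypothetical \code{coefs} matrix of
#' per-imputation coefficients, \eqn{M \times p}; not the internal
#' implementation):
#' \preformatted{
#' sel_freq <- colMeans(coefs != 0)        # selection frequency per variable
#' keep     <- sel_freq >= pool_threshold  # thresholding rule
#' pooled   <- colMeans(coefs) * keep      # average estimates, zero out the rest
#' }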
#' @param show_progress Logical; print fold-level progress and summary timings.
#' Default \code{TRUE}.
#' @param center One of \code{c("auto", "off", "force")}. Controls centering of
#' \code{X}.
#' With \code{"auto"} (recommended), centering is applied only if the training
#' data are not already centered (checked across imputations). With
#' \code{"force"}, centering is always applied. With \code{"off"}, centering
#' is skipped.
#'
#' If centering is applied, a single grand mean vector \eqn{\mu_\star} is
#' computed from the training imputations in the corresponding fold and then
#' subtracted from \emph{all} imputed training and validation matrices in that
#' fold (and analogously for the final model fit on \code{X_full}).
#'
#' @details
#' Centering affects only \code{X}; \code{y} is left unchanged. For
#' \code{type = "logistic"}, responses are treated as numeric \code{0/1}
#' and modeled on the logit scale. Validation loss is averaged over
#' imputations and then over folds.
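#'
#' As a minimal sketch of this aggregation (hypothetical \code{loss} array with
#' dimensions \code{[fold, imputation, iteration]}; not the internal data
#' structure):
#' \preformatted{
#' fold_curves <- apply(loss, c(1, 3), mean)  # average over imputations
#' CV_error    <- colMeans(fold_curves)       # average over folds
#' best_mstop  <- which.min(CV_error)         # selected stopping iteration
#' }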
#'
#' @return A list with:
#' \itemize{
#' \item \code{CV_error}: numeric vector of length \code{mstop} with the mean
#' cross-validated loss across folds (and imputations).
#' \item \code{best_mstop}: integer index of the minimizing entry in
#' \code{CV_error}.
#' \item \code{final_model}: numeric vector of length \code{1 + p} containing
#' the intercept followed by \eqn{p} coefficients of the final pooled
#' model fitted at \code{best_mstop} on \code{X_full}/\code{y_full}.
#' \item \code{center_means}: (optional) numeric vector of length \eqn{p}
#' containing the centering means used for \code{X} (when available).
#' }
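#'
#' A minimal manual-prediction sketch (assuming a hypothetical matrix
#' \code{X_new} whose columns match \code{colnames(X_full[[1]])}, and that the
#' coefficients are on the centered-\code{X} scale whenever
#' \code{center_means} is returned):
#' \preformatted{
#' b   <- res$final_model
#' Xc  <- if (!is.null(res$center_means)) sweep(X_new, 2, res$center_means) else X_new
#' eta <- apply(Xc, 1, function(x) b[1] + sum(x * b[-1]))  # linear predictor
#' # for type = "logistic", plogis(eta) gives predicted probabilities
#' }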
#'
#' @seealso \code{\link{impu_boost}}, \code{\link{cv_boost_raw}}
#'
#' @examplesIf requireNamespace("mice", quietly = TRUE) && requireNamespace("miceadds", quietly = TRUE)
#' \donttest{
#' set.seed(123)
#' utils::data(booami_sim)
#' k <- 2; M <- 2
#'
#' # Separate X and y; drop missing y (policy)
#' X_all <- booami_sim[, 1:25, drop = FALSE]
#' y_all <- booami_sim[, 26]
#' keep <- !is.na(y_all)
#' X_all <- X_all[keep, , drop = FALSE]
#' y_all <- y_all[keep]
#'
#' n <- nrow(X_all); p <- ncol(X_all)
#' folds <- sample(rep(seq_len(k), length.out = n))
#'
#' X_train_list <- vector("list", k)
#' y_train_list <- vector("list", k)
#' X_val_list <- vector("list", k)
#' y_val_list <- vector("list", k)
#'
#' for (cv in seq_len(k)) {
#' tr <- folds != cv
#' va <- !tr
#' Xtr <- X_all[tr, , drop = FALSE]; ytr <- y_all[tr]
#' Xva <- X_all[va, , drop = FALSE]; yva <- y_all[va]
#'
#' # Impute X only (y is never used for imputation)
#' pm_tr <- mice::quickpred(Xtr, method = "spearman", mincor = 0.30, minpuc = 0.60)
#' imp_tr <- mice::mice(Xtr, m = M, predictorMatrix = pm_tr, maxit = 1, printFlag = FALSE)
#' imp_va <- mice::mice.mids(imp_tr, newdata = Xva, maxit = 1, printFlag = FALSE)
#'
#' X_train_list[[cv]] <- vector("list", M)
#' y_train_list[[cv]] <- vector("list", M)
#' X_val_list[[cv]] <- vector("list", M)
#' y_val_list[[cv]] <- vector("list", M)
#'
#' for (m in seq_len(M)) {
#' tr_m <- mice::complete(imp_tr, m)
#' va_m <- mice::complete(imp_va, m)
#' X_train_list[[cv]][[m]] <- data.matrix(tr_m)
#' y_train_list[[cv]][[m]] <- ytr
#' X_val_list[[cv]][[m]] <- data.matrix(va_m)
#' y_val_list[[cv]][[m]] <- yva
#' }
#' }
#'
#' # Full-data imputations (X only)
#' pm_full <- mice::quickpred(X_all, method = "spearman", mincor = 0.30, minpuc = 0.60)
#' imp_full <- mice::mice(X_all, m = M, predictorMatrix = pm_full, maxit = 1, printFlag = FALSE)
#' X_full <- lapply(seq_len(M), function(m) data.matrix(mice::complete(imp_full, m)))
#' y_full <- lapply(seq_len(M), function(m) y_all)
#'
#' res <- cv_boost_imputed(
#' X_train_list, y_train_list,
#' X_val_list, y_val_list,
#' X_full, y_full,
#' ny = 0.1, mstop = 50, type = "gaussian",
#' MIBoost = TRUE, pool = TRUE, center = "auto",
#' show_progress = FALSE
#' )
#' \dontshow{invisible(utils::capture.output(str(res)))}
#' }
#'
#' \donttest{
#' set.seed(2025)
#' utils::data(booami_sim)
#' k <- 5; M <- 10
#'
#' X_all <- booami_sim[, 1:25, drop = FALSE]
#' y_all <- booami_sim[, 26]
#' keep <- !is.na(y_all)
#' X_all <- X_all[keep, , drop = FALSE]
#' y_all <- y_all[keep]
#'
#' n <- nrow(X_all); p <- ncol(X_all)
#' folds <- sample(rep(seq_len(k), length.out = n))
#'
#' X_train_list <- vector("list", k)
#' y_train_list <- vector("list", k)
#' X_val_list <- vector("list", k)
#' y_val_list <- vector("list", k)
#'
#' for (cv in seq_len(k)) {
#' tr <- folds != cv; va <- !tr
#' Xtr <- X_all[tr, , drop = FALSE]; ytr <- y_all[tr]
#' Xva <- X_all[va, , drop = FALSE]; yva <- y_all[va]
#'
#' pm_tr <- mice::quickpred(Xtr, method = "spearman", mincor = 0.20, minpuc = 0.40)
#' imp_tr <- mice::mice(Xtr, m = M, predictorMatrix = pm_tr, maxit = 5, printFlag = TRUE)
#' imp_va <- mice::mice.mids(imp_tr, newdata = Xva, maxit = 1, printFlag = FALSE)
#'
#' X_train_list[[cv]] <- vector("list", M)
#' y_train_list[[cv]] <- vector("list", M)
#' X_val_list[[cv]] <- vector("list", M)
#' y_val_list[[cv]] <- vector("list", M)
#'
#' for (m in seq_len(M)) {
#' tr_m <- mice::complete(imp_tr, m)
#' va_m <- mice::complete(imp_va, m)
#' X_train_list[[cv]][[m]] <- data.matrix(tr_m)
#' y_train_list[[cv]][[m]] <- ytr
#' X_val_list[[cv]][[m]] <- data.matrix(va_m)
#' y_val_list[[cv]][[m]] <- yva
#' }
#' }
#'
#' pm_full <- mice::quickpred(X_all, method = "spearman", mincor = 0.20, minpuc = 0.40)
#' imp_full <- mice::mice(X_all, m = M, predictorMatrix = pm_full, maxit = 5, printFlag = TRUE)
#' X_full <- lapply(seq_len(M), function(m) data.matrix(mice::complete(imp_full, m)))
#' y_full <- lapply(seq_len(M), function(m) y_all)
#'
#' res_heavy <- cv_boost_imputed(
#' X_train_list, y_train_list,
#' X_val_list, y_val_list,
#' X_full, y_full,
#' ny = 0.1, mstop = 250, type = "gaussian",
#' MIBoost = TRUE, pool = TRUE, center = "auto",
#' show_progress = TRUE
#' )
#' str(res_heavy)
#' }
#' @export
cv_boost_imputed <- function(
X_train_list, y_train_list,
X_val_list, y_val_list,
X_full, y_full,
ny = 0.1, mstop = 250, type = c("gaussian","logistic"),
MIBoost = TRUE, pool = TRUE, pool_threshold = 0,
show_progress = TRUE,
center = c("auto","off","force")
) {
# match args (keep same order/choices as core)
type <- match.arg(type)
center <- match.arg(center)
# ---- light validation (fold-level) ----
K <- length(y_train_list)
stopifnot(
length(X_train_list) == K,
length(X_val_list) == K,
length(y_val_list) == K,
K >= 2L
)
if (!is.numeric(ny) || length(ny) != 1L || !is.finite(ny) || ny <= 0) {
stop("`ny` must be a single positive numeric.")
}
if (!is.numeric(mstop) || length(mstop) != 1L || !is.finite(mstop) || mstop < 1) {
stop("`mstop` must be a single positive integer.")
}
mstop <- as.integer(mstop)
# ensure each fold has same number of imputations M and basic shapes look sane
Ms <- integer(K)
for (cv in seq_len(K)) {
Ms[cv] <- length(X_train_list[[cv]])
if (length(y_train_list[[cv]]) != Ms[cv] ||
length(X_val_list[[cv]]) != Ms[cv] ||
length(y_val_list[[cv]]) != Ms[cv]) {
stop(sprintf("Fold %d: inconsistent number of imputations across X/y train/val.", cv))
}
}
if (length(unique(Ms)) != 1L) stop("All folds must have the same number of imputations M.")
M <- Ms[1L]
# full data lists should also have M imputations
if (length(X_full) != M || length(y_full) != M) {
stop("`X_full` and `y_full` must each be lists of length M (same M as the folds).")
}
# basic logistic target check (silent coercion not performed)
if (type == "logistic") {
ok01 <- function(v) all(is.finite(v)) && all(v %in% c(0,1))
if (!ok01(y_train_list[[1]][[1]]) || !ok01(y_val_list[[1]][[1]]) || !ok01(y_full[[1]])) {
stop("For type='logistic', outcomes must be 0/1 (no NAs, no other values).")
}
}
# ---- delegate to the core ----
res <- .cv_boost_core(
X_train_list = X_train_list, y_train_list = y_train_list,
X_val_list = X_val_list, y_val_list = y_val_list,
X = X_full, y = y_full,
ny = ny, mstop = mstop, type = type,
MIBoost = MIBoost, pool = pool, pool_threshold = pool_threshold,
show_progress = show_progress,
center = center
)
# ---- friendly metadata & names for prediction -------------------
# 1) Annotate family type for booami_predict() inference
if (is.null(res$type)) res$type <- type
# 2) Bubble up center_means if the core exposed them (preferred)
# a) directly on the returned object
# b) or nested in a `final_fit` list (if the core returns the impu_boost fit)
if (is.null(res$center_means)) {
if (!is.null(res$final_fit) && !is.null(res$final_fit$center_means)) {
res$center_means <- res$final_fit$center_means
}
}
# (No attempt here to "recompute" means from X_full - we only propagate what the core actually used.)
# 3) Name final_model coefficients for name-based alignment in prediction
if (!is.null(res$final_model)) {
pred_names <- colnames(X_full[[1L]])
if (!is.null(pred_names) &&
length(res$final_model) == (length(pred_names) + 1L) &&
is.null(names(res$final_model))) {
names(res$final_model) <- c("(Intercept)", pred_names)
}
}
class(res) <- c("booami_cv_imputed", "booami_cv", class(res))
res
}
#' Cross-Validated Component-Wise Gradient Boosting with
#' Multiple Imputation Performed Inside Each Fold
#'
#' Performs k-fold cross-validation for \code{\link{impu_boost}} on data with
#' missing values. Within each fold, multiple imputation, centering, model
#' fitting, and validation are performed in a leakage-avoiding manner to select
#' the optimal number of boosting iterations (\code{mstop}). The final model is
#' then fitted on multiple imputations of the full dataset at the selected
#' stopping iteration.
#'
#' @details
#' Rows with missing outcomes \code{y} are removed before fold assignment.
#' Within each CV fold, the remaining data are first split into a training subset
#' and a validation subset. Multiple imputation is then performed on the
#' covariates \code{X} only (the outcome is never imputed and is not used as a
#' predictor in the imputation models). The training covariates are multiply
#' imputed \eqn{M} times using \pkg{mice}, producing \eqn{M} imputed training
#' datasets. The corresponding validation covariates are then imputed \eqn{M}
#' times using the imputation models learned from the training data (leakage-avoiding).
#'
#' If centering is applied, a single grand mean vector \eqn{\mu_\star} is
#' computed from the imputed training covariates in the corresponding fold and
#' subtracted from \emph{all} imputed training and validation covariate matrices
#' in that fold.
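#'
#' A minimal sketch of this centering step (assumed form, with
#' \code{X_tr_imps} and \code{X_va_imps} denoting the lists of imputed
#' training and validation matrices in one fold):
#' \preformatted{
#' mu_star   <- Reduce(`+`, lapply(X_tr_imps, colMeans)) / length(X_tr_imps)
#' X_tr_imps <- lapply(X_tr_imps, function(Xm) sweep(Xm, 2, mu_star))
#' X_va_imps <- lapply(X_va_imps, function(Xm) sweep(Xm, 2, mu_star))
#' }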
#'
#' \code{\link{impu_boost}} is run on the imputed training datasets for up to
#' \code{mstop} boosting iterations. At each iteration, prediction errors are
#' computed on the corresponding validation datasets and averaged across
#' imputations. This yields an aggregated error curve per fold, which is then
#' averaged across folds. The optimal stopping iteration is chosen as the
#' \code{mstop} value minimizing the mean CV error.
#'
#' Finally, the full covariate matrix \code{X} is multiply imputed \eqn{M} times.
#' If centering is applied, it uses a grand mean \eqn{\mu_\star} computed across
#' the \eqn{M} full-data imputations. \code{\link{impu_boost}} is applied to these
#' datasets for the selected number of boosting iterations to obtain the final model.
#'
#' \strong{Imputation control.} All key \pkg{mice} settings can be passed via
#' \code{impute_args} (a named list forwarded to \code{mice::mice()}) and/or
#' \code{impute_method} (a named character vector of per-variable methods).
#' Internally, the function builds a full default method vector from the actual
#' data given to \code{mice()}, then \emph{merges} any user-supplied entries
#' by name. \emph{The names in \code{impute_method} must exactly match the
#' column names in \code{X} (i.e., the data passed to \code{mice()}).} Partial
#' vectors are allowed; variables not listed fall back to defaults; unknown names
#' are ignored with a warning. The function sets and may override \code{data},
#' \code{method} (after merging overrides), \code{predictorMatrix}, and
#' \code{ignore} (to enforce train-only learning). Predictor matrices can be built
#' with \code{mice::quickpred()} (see \code{use_quickpred}, \code{quickpred_args})
#' or with \code{mice::make.predictorMatrix()}.
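#'
#' A minimal sketch of the merge (hypothetical column names \code{x3} and
#' \code{x7}; mirrors the documented default-then-override behaviour, not
#' necessarily the exact internal code):
#' \preformatted{
#' meth <- mice::make.method(X)                # full default method vector
#' user <- c(x3 = "pmm", x7 = "logreg")        # partial user overrides
#' hit  <- intersect(names(user), names(meth)) # names must match columns of X
#' meth[hit] <- user[hit]                      # unmatched names are ignored
#' }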
#'
#' @references
#' Kuchen, R. (2025). \emph{MIBoost: A Gradient Boosting Algorithm for Variable
#' Selection After Multiple Imputation}. arXiv:2507.21807.
#' \doi{10.48550/arXiv.2507.21807} \url{https://arxiv.org/abs/2507.21807}.
#'
#' @param X A data.frame or matrix of predictors of size \eqn{n \times p}
#' containing missing values. Column names are preserved. Rows with missing
#' outcomes \code{y} are removed before CV. If no missing values are present in
#' \code{X} after removing missing-\code{y} rows, use
#' \code{\link{cv_boost_imputed}} instead.
#' @param y A vector of length \eqn{n} with the outcome (numeric for
#' \code{type = "gaussian"}; numeric \code{0/1}, or a factor/character vector
#' coded \code{"0"}/\code{"1"}, for \code{type = "logistic"}). Must align with
#' \code{X} rows. Rows with missing \code{y} are removed before CV. The outcome
#' is never imputed and is not used as a predictor in imputation models.
#' @param k Number of cross-validation folds. Default is \code{5}.
#' @param ny Learning rate. Defaults to \code{0.1}.
#' @param mstop Maximum number of boosting iterations to evaluate during
#' cross-validation. The selected \code{mstop} is the value minimizing
#' the mean CV error over \code{1:mstop}. Default is \code{250}.
#' @param type Type of loss function. One of:
#' \code{"gaussian"} (mean squared error) for continuous responses,
#' or \code{"logistic"} (binomial deviance) for binary responses.
#' @param MIBoost Logical. If \code{TRUE}, applies the MIBoost algorithm,
#' which enforces uniform variable selection across all imputed datasets. If
#' \code{FALSE}, variables are selected independently within each imputed
#' dataset, and pooling is governed by \code{pool_threshold}.
#' @param pool Logical. If \code{TRUE}, models across the \eqn{M} imputed
#' datasets are aggregated into a single final model. If \code{FALSE},
#' \eqn{M} separate models are returned.
#' @param pool_threshold Only used when \code{MIBoost = FALSE} and
#' \code{pool = TRUE}. Controls the pooling rule when aggregating the
#' \eqn{M} models obtained from the imputed datasets into a single final model.
#' A candidate variable is included only if it is selected in at least
#' \code{pool_threshold} (a value in [0, 1]) proportion of the imputed
#' datasets; coefficients of all other variables are set to zero. A value of
#' \code{0} corresponds to estimate-averaging, while values \code{> 0}
#' correspond to selection-frequency thresholding.
#' @param impute_args A named list of arguments forwarded to \code{mice::mice()}
#' both inside CV and on the full dataset (e.g., \code{m}, \code{maxit},
#' \code{seed}, \code{printFlag}, etc.). Internally, \code{data},
#' \code{predictorMatrix}, and \code{ignore} are set by the function and will
#' override any values supplied here. If \code{m} is missing, a default of
#' \code{10} is used.
#' @param impute_method Optional \emph{named} character vector passed to
#' \code{mice::mice(method = ...)} to control per-variable methods for
#' covariates \code{X} (e.g., \code{"pmm"}, \code{"logreg"}). This may be a
#' \emph{partial} vector: entries are merged by name into a full default method
#' vector derived from \code{X}; unmatched names are ignored with a warning.
#' If \code{NULL} (default), numeric covariates use \code{"pmm"}.
#' @param use_quickpred Logical. If \code{TRUE} (default), build the
#' \code{predictorMatrix} via \code{mice::quickpred()} on the training covariates
#' within each fold; otherwise use \code{mice::make.predictorMatrix()}.
#' @param quickpred_args A named list of arguments forwarded to
#' \code{mice::quickpred()} (e.g., \code{mincor}, \code{minpuc},
#' \code{method}, \code{include}, \code{exclude}). Ignored when
#' \code{use_quickpred = FALSE}.
#' @param seed Base random seed for fold assignment. If \code{impute_args$seed}
#' is \emph{not} supplied, this value also seeds imputation; otherwise the
#' user-specified \code{impute_args$seed} is respected and deterministically
#' offset per fold. RNG state is restored on exit. Default \code{123}.
#' @param show_progress Logical. If \code{TRUE} (default), print progress for
#' the imputation and boosting phases, plus a summary at completion.
#' @param return_full_imputations Logical. If \code{TRUE}, attach the list of
#' full-data imputations used for the final fit as
#' \code{$full_imputations = list(X = <list length m>, y = <list length m>)}.
#' Default is \code{FALSE}.
#' @param center One of \code{c("auto", "off", "force")}. Controls centering of
#' \code{X}. With \code{"auto"} (recommended), centering is applied only if the
#' training data are not already centered (checked across imputations). With
#' \code{"force"}, centering is always applied. With \code{"off"}, centering is
#' skipped.
#'
#' If centering is applied, a single grand mean vector \eqn{\mu_\star} is
#' computed from the imputed training covariates in the corresponding fold and
#' subtracted from all imputed training and validation matrices in that fold
#' (and analogously for the final model fit on the full-data imputations).
#'
#' @return A list with:
#' \itemize{
#' \item \code{CV_error}: numeric vector (length \code{mstop}) of mean CV loss.
#' \item \code{best_mstop}: integer index minimizing \code{CV_error}.
#' \item \code{final_model}: numeric vector of length \code{1 + p} with the
#' intercept and pooled coefficients of the final fit on full-data
#' imputations at \code{best_mstop}.
#' \item \code{full_imputations}: (optional) when \code{return_full_imputations=TRUE},
#' a list \code{list(X = <list length m>, y = <list length m>)} containing
#' the full-data imputations used for the final model.
#' \item \code{folds}: integer vector of length \eqn{n} giving the CV fold id
#' for each observation (1..k).
#' }
#'
#' @seealso \code{\link{impu_boost}}, \code{\link{cv_boost_imputed}}, \pkg{mice}
#'
#' @examplesIf requireNamespace("mice", quietly = TRUE) && requireNamespace("miceadds", quietly = TRUE)
#' \donttest{
#' utils::data(booami_sim)
#' X <- booami_sim[, 1:25]
#' y <- booami_sim[, 26]
#'
#' res <- cv_boost_raw(
#' X = X, y = y,
#' k = 2, seed = 123,
#' impute_args = list(m = 2, maxit = 1, printFlag = FALSE, seed = 1),
#' quickpred_args = list(mincor = 0.30, minpuc = 0.60),
#' mstop = 50,
#' show_progress = FALSE
#' )
#'
#' # Partial custom imputation method override (X variables only)
#' meth <- c(X1 = "pmm")
#' res2 <- cv_boost_raw(
#' X = X, y = y,
#' k = 2, seed = 123,
#' impute_args = list(m = 2, maxit = 1, printFlag = FALSE, seed = 456),
#' quickpred_args = list(mincor = 0.30, minpuc = 0.60),
#' mstop = 50,
#' impute_method = meth,
#' show_progress = FALSE
#' )
#' \dontshow{invisible(utils::capture.output(str(res2)))}
#' }
#'
#' @export
cv_boost_raw <- function(
X, y,
k = 5, ny = 0.1, mstop = 250, type = c("gaussian","logistic"),
MIBoost = TRUE, pool = TRUE, pool_threshold = 0,
impute_args = list(m = 10, maxit = 5, printFlag = FALSE),
impute_method = NULL,
use_quickpred = TRUE,
quickpred_args = list(mincor = 0.1, minpuc = 0.5, method = NULL, include = NULL, exclude = NULL),
seed = 123,
show_progress = TRUE,
return_full_imputations = FALSE,
center = "auto"
) {
type <- match.arg(type)
center <- match.arg(center, c("auto","off","force"))
if (!is.data.frame(X) && !is.matrix(X))
stop("X must be a data.frame or matrix.")
X <- as.data.frame(X, check.names = FALSE)
if (length(y) != nrow(X))
stop("Length of y must match nrow(X).")
# --- drop rows with missing y (so y is never imputed, and cannot be used) ---
keep <- !is.na(y)
if (!any(keep)) stop("All rows have missing y; nothing to fit.")
if (any(!keep) && isTRUE(show_progress)) {
cat(sprintf("Dropping %d rows with missing y...\n", sum(!keep)))
}
X <- X[keep, , drop = FALSE]
y <- y[keep]
# after dropping missing y, ensure y is finite
if (any(!is.finite(as.numeric(y)))) stop("Non-finite values in y after dropping NAs.")
# logistic guard: must be 0/1 with no NAs
if (type == "logistic") {
y_num <- as.numeric(y)
if (!all(y_num %in% c(0, 1))) {
# also allow factor levels "0"/"1" etc.
if (is.factor(y) || is.character(y)) {
y_num2 <- suppressWarnings(as.numeric(as.character(y)))
if (!all(is.finite(y_num2)) || !all(y_num2 %in% c(0, 1))) {
stop("For type='logistic', y must be coded 0/1 (no NAs).")
}
y <- y_num2
} else {
stop("For type='logistic', y must be coded 0/1 (no NAs).")
}
} else {
y <- y_num
}
} else {
y <- as.numeric(y)
}
# Now we only care whether X has missingness
if (!anyNA(X)) {
stop("After removing missing y, X has no missing values. Use cv_boost_imputed() (or a complete-data variant).")
}
if (!("ignore" %in% names(formals(mice::mice))))
stop("Update 'mice': need 'ignore' in mice().")
n <- nrow(X)
# --- folds: reproducible WITHOUT altering the user's RNG state
folds_id <- withr::with_preserve_seed({
seq_rep <- rep(seq_len(k), length.out = n)
if (is.null(seed)) {
sample(seq_rep)
} else {
withr::with_seed(seed, sample(seq_rep))
}
})
if (!is.logical(pool) || length(pool) != 1L) {
stop("'pool' must be a single TRUE/FALSE.")
}
seed_base_mice <- if (!is.null(impute_args$seed)) impute_args$seed else seed
get_m <- function() if (!is.null(impute_args$m)) impute_args$m else 10L
# methods builder (NO y column here)
build_methods_X <- function(datX) {
meth <- mice::make.method(datX)
is_num <- vapply(datX, is.numeric, logical(1L))
meth[is_num] <- "pmm"
# allow user override for X columns (merge by name; warn about unmatched names)
if (!is.null(impute_method)) {
nm <- intersect(names(impute_method), names(meth))
unmatched <- setdiff(names(impute_method), names(meth))
if (length(unmatched)) {
warning("Ignoring 'impute_method' entries that do not match any X column: ",
paste(unmatched, collapse = ", "))
}
if (length(nm)) meth[nm] <- impute_method[nm]
}
meth
}
m <- get_m()
if (isTRUE(show_progress)) cat(sprintf("MI-in-CV (no y in imputation): %d folds x %d imputations...\n", k, m))
X_train_list <- vector("list", k); y_train_list <- vector("list", k)
X_val_list <- vector("list", k); y_val_list <- vector("list", k)
for (i in seq_len(k)) {
tr_idx <- folds_id != i; va_idx <- !tr_idx
Xtr <- X[tr_idx, , drop = FALSE]; ytr <- y[tr_idx]
Xva <- X[va_idx, , drop = FALSE]; yva <- y[va_idx]
# IMPORTANT: impute only X (no y column at all)
dat_tr <- Xtr
dat_va <- Xva
dat_all <- rbind(dat_tr, dat_va)
ntr <- nrow(dat_tr)
if (isTRUE(use_quickpred)) {
qp <- quickpred_args
qp <- qp[!vapply(qp, is.null, logical(1))]
pm <- do.call(mice::quickpred, c(list(data = dat_tr), qp))
} else {
pm <- mice::make.predictorMatrix(dat_tr)
}
meth <- build_methods_X(dat_all)
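# Validation rows are included in `dat_all` so they receive imputed values,
# but ignore = TRUE excludes them from fitting the imputation models
# (train-only learning).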
ignore_vec <- c(rep(FALSE, ntr), rep(TRUE, nrow(dat_va)))
args <- utils::modifyList(impute_args, list(
data = dat_all,
method = meth,
predictorMatrix = pm,
ignore = ignore_vec
), keep.null = TRUE)
if (is.null(args$m)) args$m <- m
if (is.null(args$seed)) {
args$seed <- seed_base_mice + i
} else {
args$seed <- args$seed + i
}
if (isTRUE(show_progress)) cat(sprintf("Fold %d: mice on X only (m=%d)...\n", i, args$m))
mids <- do.call(mice::mice, args)
X_train_list[[i]] <- vector("list", args$m)
y_train_list[[i]] <- vector("list", args$m)
X_val_list[[i]] <- vector("list", args$m)
y_val_list[[i]] <- vector("list", args$m)
for (j in seq_len(args$m)) {
comp <- mice::complete(mids, action = j)
comp_tr <- comp[seq_len(ntr), , drop = FALSE]
comp_va <- comp[(ntr + 1):nrow(comp), , drop = FALSE]
X_train_list[[i]][[j]] <- data.matrix(comp_tr)
X_val_list[[i]][[j]] <- data.matrix(comp_va)
# y is NOT imputed; just replicate observed y per imputation
y_train_list[[i]][[j]] <- ytr
y_val_list[[i]][[j]] <- yva
}
}
# --- Full-data imputations (again: X only, no y in mice) ---
if (isTRUE(show_progress)) cat("Full-data imputations (X only)...\n")
dat_full <- X
if (isTRUE(use_quickpred)) {
qp <- quickpred_args
qp <- qp[!vapply(qp, is.null, logical(1))]
pm_full <- do.call(mice::quickpred, c(list(data = dat_full), qp))
} else {
pm_full <- mice::make.predictorMatrix(dat_full)
}
meth_full <- build_methods_X(dat_full)
args_full <- utils::modifyList(impute_args, list(
data = dat_full,
method = meth_full,
predictorMatrix = pm_full
), keep.null = TRUE)
if (is.null(args_full$m)) args_full$m <- m
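# Offset the seed well beyond the per-fold offsets so the full-data
# imputation uses a different seed than any CV fold.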
if (is.null(args_full$seed)) {
args_full$seed <- seed_base_mice + 10^6
} else {
args_full$seed <- args_full$seed + 10^6
}
mids_full <- do.call(mice::mice, args_full)
X_list <- vector("list", args_full$m)
y_list <- vector("list", args_full$m)
for (j in seq_len(args_full$m)) {
comp_full <- mice::complete(mids_full, j)
X_list[[j]] <- data.matrix(comp_full)
y_list[[j]] <- y # replicated observed y
}
if (isTRUE(show_progress))
cat(sprintf("Boosting: k=%d, m=%d; mstop=%d...\n", k, m, mstop))
res <- .cv_boost_core(
X_train_list, y_train_list,
X_val_list, y_val_list,
X_list, y_list,
ny, mstop, type, MIBoost, pool, pool_threshold,
show_progress = show_progress,
center = center
)
class(res) <- c("booami_cv_raw", "booami_cv", class(res))
if (is.null(res$type)) res$type <- type
if (is.null(res$center_means) && !is.null(res$final_fit) && !is.null(res$final_fit$center_means)) {
res$center_means <- res$final_fit$center_means
}
if (is.null(res$center_means_list) && !is.null(res$final_fit) && !is.null(res$final_fit$center_means_list)) {
res$center_means_list <- res$final_fit$center_means_list
}
pred_names <- colnames(X_list[[1L]])
if (!is.null(res$final_model)) {
if (!is.null(pred_names) &&
length(res$final_model) == (length(pred_names) + 1L) &&
is.null(names(res$final_model))) {
names(res$final_model) <- c("(Intercept)", pred_names)
}
} else if (!is.null(res$final_models) && length(res$final_models)) {
for (m_i in seq_along(res$final_models)) {
fm <- res$final_models[[m_i]]
if (!is.null(pred_names) &&
length(fm) == (length(pred_names) + 1L) &&
is.null(names(fm))) {
names(fm) <- c("(Intercept)", pred_names)
res$final_models[[m_i]] <- fm
}
}
}
if (isTRUE(return_full_imputations)) {
res$full_imputations <- list(X = X_list, y = y_list)
}
res$folds <- folds_id
res
}