cv_boost_imputed                R Documentation

Description
To avoid data leakage, each CV fold should first be split into training and validation subsets and imputation performed afterwards, with the validation rows imputed using models estimated on the training rows only. For the final model, the full data set is imputed independently.
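Schematically, the order of operations is as follows (an outline only; full runnable versions appear under Examples):

# 1. Split the rows into k CV folds.
# 2. Within each fold, impute the training rows, then impute the
#    validation rows using the imputation models fitted on the
#    training rows (in the examples: mice::mice() followed by
#    mice::mice.mids(newdata = ...)).
# 3. Pass the resulting nested lists to cv_boost_imputed().
# 4. Impute the full data set independently and fit the final model
#    at the selected number of boosting iterations.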
Usage

cv_boost_imputed(
  X_train_list,
  y_train_list,
  X_val_list,
  y_val_list,
  X_full,
  y_full,
  ny = 0.1,
  mstop = 250,
  type = c("gaussian", "logistic"),
  MIBoost = TRUE,
  pool = TRUE,
  pool_threshold = 0,
  show_progress = TRUE,
  center = c("auto", "off", "force")
)
Arguments

X_train_list
    A list of length k (number of CV folds); element cv is itself a
    list of length M containing the imputed training design matrices
    for fold cv.

y_train_list
    A list of length k; element cv is a list of length M of training
    response vectors (one per imputation).

X_val_list
    A list of length k; element cv is a list of length M of imputed
    validation design matrices for fold cv.

y_val_list
    A list of length k; element cv is a list of length M of validation
    response vectors.

X_full
    A list of length M of imputed full-data design matrices used to
    fit the final model.

y_full
    A list of length M of full-data response vectors.
ny
    Learning rate. Defaults to 0.1.

mstop
    Maximum number of boosting iterations to evaluate during
    cross-validation. The selected best_mstop is the iteration that
    minimizes the mean cross-validated loss.

type
    Type of loss function. One of "gaussian" (squared-error loss) or
    "logistic" (binomial loss).
MIBoost
    Logical. If TRUE, the MIBoost algorithm of Kuchen (2025) is used,
    which enforces a common variable-selection pattern across all
    imputations; if FALSE, boosting is run separately within each
    imputation.

pool
    Logical. If TRUE, the coefficients of the final model are pooled
    (averaged) across the M imputations.

pool_threshold
    Only used when pool = TRUE. Proportion-of-imputations threshold
    for retaining a variable in the pooled model; variables selected
    in fewer than this proportion of imputations have their pooled
    coefficient set to zero. Defaults to 0 (no thresholding).
show_progress
    Logical; print fold-level progress and summary timings.
    Defaults to TRUE.
center
    One of "auto", "off", or "force". If centering is applied, a
    single grand mean vector is computed and used to center every
    imputed design matrix.
Details

The recommended workflow is illustrated in the examples; a minimal
sketch of the expected input shapes follows.
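As an illustration of the required nested-list structure (complete toy data standing in for real imputations; all dimensions are arbitrary and no imputation is performed):

k <- 2; M <- 3; n <- 40; p <- 5
X <- matrix(rnorm(n * p), n, p)
y <- rnorm(n)
folds <- sample(rep(seq_len(k), length.out = n))

# Outer lists run over folds; inner lists run over imputations
X_train_list <- lapply(seq_len(k), function(cv)
  replicate(M, X[folds != cv, , drop = FALSE], simplify = FALSE))
y_train_list <- lapply(seq_len(k), function(cv)
  replicate(M, y[folds != cv], simplify = FALSE))
X_val_list <- lapply(seq_len(k), function(cv)
  replicate(M, X[folds == cv, , drop = FALSE], simplify = FALSE))
y_val_list <- lapply(seq_len(k), function(cv)
  replicate(M, y[folds == cv], simplify = FALSE))

# Full-data lists run over imputations only
X_full <- replicate(M, X, simplify = FALSE)
y_full <- replicate(M, y, simplify = FALSE)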
Centering affects only X; y is left unchanged. For type = "logistic",
responses are treated as numeric 0/1 via the logistic link. Validation
loss is averaged over imputations and then over folds.
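A toy sketch of that aggregation order (the loss values here are made up; in the actual routine they come from the boosting fits):

k <- 2; M <- 3
# loss[cv, m]: validation loss of imputation m in fold cv at one iteration
loss <- matrix(runif(k * M), nrow = k, ncol = M)
per_fold <- rowMeans(loss)  # average over imputations within each fold
cv_error <- mean(per_fold)  # then average over folds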
Value

A list with components:

CV_error
    Numeric vector of length mstop with the mean cross-validated loss
    across folds (and imputations).

best_mstop
    Integer index of the minimizing entry in CV_error.

final_model
    Numeric vector of length 1 + p containing the intercept followed
    by the p coefficients of the final pooled model fitted at
    best_mstop on X_full/y_full.

center_means
    (Optional) numeric vector of length p containing the centering
    means used for X (when available).
References

Kuchen, R. (2025). MIBoost: A Gradient Boosting Algorithm for Variable
Selection After Multiple Imputation. arXiv:2507.21807.
doi:10.48550/arXiv.2507.21807. https://arxiv.org/abs/2507.21807
See Also

impu_boost, cv_boost_raw
Examples

## Small, fast illustration: k = 2 folds, M = 2 imputations
set.seed(123)
utils::data(booami_sim)
k <- 2; M <- 2
# Separate X and y; drop missing y (policy)
X_all <- booami_sim[, 1:25, drop = FALSE]
y_all <- booami_sim[, 26]
keep <- !is.na(y_all)
X_all <- X_all[keep, , drop = FALSE]
y_all <- y_all[keep]
n <- nrow(X_all); p <- ncol(X_all)
folds <- sample(rep(seq_len(k), length.out = n))
X_train_list <- vector("list", k)
y_train_list <- vector("list", k)
X_val_list <- vector("list", k)
y_val_list <- vector("list", k)
for (cv in seq_len(k)) {
  tr <- folds != cv
  va <- !tr
  Xtr <- X_all[tr, , drop = FALSE]; ytr <- y_all[tr]
  Xva <- X_all[va, , drop = FALSE]; yva <- y_all[va]

  # Impute X only (y is never used for imputation)
  pm_tr <- mice::quickpred(Xtr, method = "spearman", mincor = 0.30, minpuc = 0.60)
  imp_tr <- mice::mice(Xtr, m = M, predictorMatrix = pm_tr, maxit = 1, printFlag = FALSE)
  imp_va <- mice::mice.mids(imp_tr, newdata = Xva, maxit = 1, printFlag = FALSE)

  X_train_list[[cv]] <- vector("list", M)
  y_train_list[[cv]] <- vector("list", M)
  X_val_list[[cv]]   <- vector("list", M)
  y_val_list[[cv]]   <- vector("list", M)

  for (m in seq_len(M)) {
    tr_m <- mice::complete(imp_tr, m)
    va_m <- mice::complete(imp_va, m)
    X_train_list[[cv]][[m]] <- data.matrix(tr_m)
    y_train_list[[cv]][[m]] <- ytr
    X_val_list[[cv]][[m]]   <- data.matrix(va_m)
    y_val_list[[cv]][[m]]   <- yva
  }
}
# Full-data imputations (X only)
pm_full <- mice::quickpred(X_all, method = "spearman", mincor = 0.30, minpuc = 0.60)
imp_full <- mice::mice(X_all, m = M, predictorMatrix = pm_full, maxit = 1, printFlag = FALSE)
X_full <- lapply(seq_len(M), function(m) data.matrix(mice::complete(imp_full, m)))
y_full <- lapply(seq_len(M), function(m) y_all)
res <- cv_boost_imputed(
  X_train_list, y_train_list,
  X_val_list, y_val_list,
  X_full, y_full,
  ny = 0.1, mstop = 50, type = "gaussian",
  MIBoost = TRUE, pool = TRUE, center = "auto",
  show_progress = FALSE
)
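# Inspect the result (component names as documented under Value)
res$best_mstop
head(res$final_model)

## Heavier configuration: k = 5 folds, M = 10 imputations, more mice
## iterations; expect a substantially longer runtime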
set.seed(2025)
utils::data(booami_sim)
k <- 5; M <- 10
X_all <- booami_sim[, 1:25, drop = FALSE]
y_all <- booami_sim[, 26]
keep <- !is.na(y_all)
X_all <- X_all[keep, , drop = FALSE]
y_all <- y_all[keep]
n <- nrow(X_all); p <- ncol(X_all)
folds <- sample(rep(seq_len(k), length.out = n))
X_train_list <- vector("list", k)
y_train_list <- vector("list", k)
X_val_list <- vector("list", k)
y_val_list <- vector("list", k)
for (cv in seq_len(k)) {
tr <- folds != cv; va <- !tr
Xtr <- X_all[tr, , drop = FALSE]; ytr <- y_all[tr]
Xva <- X_all[va, , drop = FALSE]; yva <- y_all[va]
pm_tr <- mice::quickpred(Xtr, method = "spearman", mincor = 0.20, minpuc = 0.40)
imp_tr <- mice::mice(Xtr, m = M, predictorMatrix = pm_tr, maxit = 5, printFlag = TRUE)
imp_va <- mice::mice.mids(imp_tr, newdata = Xva, maxit = 1, printFlag = FALSE)
X_train_list[[cv]] <- vector("list", M)
y_train_list[[cv]] <- vector("list", M)
X_val_list[[cv]] <- vector("list", M)
y_val_list[[cv]] <- vector("list", M)
for (m in seq_len(M)) {
tr_m <- mice::complete(imp_tr, m)
va_m <- mice::complete(imp_va, m)
X_train_list[[cv]][[m]] <- data.matrix(tr_m)
y_train_list[[cv]][[m]] <- ytr
X_val_list[[cv]][[m]] <- data.matrix(va_m)
y_val_list[[cv]][[m]] <- yva
}
}
pm_full <- mice::quickpred(X_all, method = "spearman", mincor = 0.20, minpuc = 0.40)
imp_full <- mice::mice(X_all, m = M, predictorMatrix = pm_full, maxit = 5, printFlag = TRUE)
X_full <- lapply(seq_len(M), function(m) data.matrix(mice::complete(imp_full, m)))
y_full <- lapply(seq_len(M), function(m) y_all)
res_heavy <- cv_boost_imputed(
  X_train_list, y_train_list,
  X_val_list, y_val_list,
  X_full, y_full,
  ny = 0.1, mstop = 250, type = "gaussian",
  MIBoost = TRUE, pool = TRUE, center = "auto",
  show_progress = TRUE
)
str(res_heavy)