R/run_real_data_study.R
In missPLS: Methods and Reproducible Workflows for Partial Least Squares with Missing Data

Documented in run_real_data_study

#' Run a real-data study
#'
#' @param dataset A packaged dataset name or `misspls_dataset` object.
#' @param seed Optional base random seed.
#' @param missing_props Missing-data proportions as fractions or percentages.
#' @param mechanisms Missing-data mechanisms.
#' @param reps Number of replicate missingness draws for each mechanism and
#'   proportion.
#' @param baseline_reps Number of repeated complete-data `Q2-10fold` selections
#'   used to determine `t**`.
#' @param max_ncomp Maximum number of extracted components.
#' @param criteria Criteria evaluated on incomplete and imputed data.
#' @param incomplete_methods Incomplete-data NIPALS workflows.
#' @param imputation_methods Imputation methods.
#' @param folds Number of folds used by `"Q2-10fold"`.
#' @param mar_y_bias MAR bias parameter passed to [add_missingness()].
#'
#' @return A data frame with one row per study run.
#' @export
run_real_data_study <- function(
    dataset,
    seed = NULL,
    missing_props = seq(5, 50, 5),
    mechanisms = c("MCAR", "MAR"),
    reps = 1L,
    baseline_reps = 100L,
    max_ncomp = 12L,
    criteria = c("Q2-LOO", "Q2-10fold", "AIC", "AIC-DoF", "BIC", "BIC-DoF"),
    incomplete_methods = c("nipals_standard", "nipals_adaptative"),
    imputation_methods = c("mice", "knn", "svd"),
    folds = 10L,
    mar_y_bias = 0.8) {
  dataset <- .coerce_dataset(dataset)
  seed <- .normalize_seed(seed, allow_null = TRUE)
  reps <- as.integer(reps)
  baseline_reps <- as.integer(baseline_reps)

  rows <- vector("list", 0L)
  row_id <- 0L

  baseline_selections <- vector("list", baseline_reps)
  for (i in seq_len(baseline_reps)) {
    selection_seed <- .seed_with_offset(seed, i)
    timing <- system.time(
      baseline_selections[[i]] <- select_ncomp(
        x = dataset$x,
        y = dataset$y,
        method = "complete",
        criterion = "Q2-10fold",
        max_ncomp = max_ncomp,
        seed = selection_seed,
        folds = folds
      )
    )
    row_id <- row_id + 1L
    rows[[row_id]] <- .study_row(
      study = "real_data",
      dataset = dataset$name,
      n = nrow(dataset$x),
      p = ncol(dataset$x),
      true_ncomp = NA_integer_,
      target_ncomp = baseline_selections[[i]]$selected_ncomp,
      mechanism = NA_character_,
      missing_prop = 0,
      method = "Complete",
      criterion = "Q2-10fold",
      selected_ncomp = baseline_selections[[i]]$selected_ncomp,
      matched_target = NA,
      runtime_sec = unname(timing["elapsed"]),
      seed = selection_seed,
      status = baseline_selections[[i]]$status,
      notes = baseline_selections[[i]]$notes
    )
  }

  target_ncomp <- .mode_integer(vapply(baseline_selections, function(x) x$selected_ncomp, integer(1L)))

  for (mechanism in mechanisms) {
    for (missing_prop in missing_props) {
      for (rep_id in seq_len(reps)) {
        miss_seed <- .seed_with_offset(seed, 1000L * rep_id + match(mechanism, mechanisms) * 100L)
        miss <- add_missingness(
          x = dataset$x,
          y = dataset$y,
          mechanism = mechanism,
          missing_prop = missing_prop,
          seed = miss_seed,
          mar_y_bias = mar_y_bias
        )

        for (selection_method in incomplete_methods) {
          for (criterion in criteria) {
            selection_seed <- .seed_with_offset(miss_seed, row_id + 11L)
            timing <- system.time(
              sel <- select_ncomp(
                x = miss$x_incomplete,
                y = dataset$y,
                method = selection_method,
                criterion = criterion,
                max_ncomp = max_ncomp,
                seed = selection_seed,
                folds = folds
              )
            )
            row_id <- row_id + 1L
            rows[[row_id]] <- .study_row(
              study = "real_data",
              dataset = dataset$name,
              n = nrow(dataset$x),
              p = ncol(dataset$x),
              true_ncomp = NA_integer_,
              target_ncomp = target_ncomp,
              mechanism = mechanism,
              missing_prop = .normalize_missing_prop(missing_prop) * 100,
              method = .selection_method_label(selection_method),
              criterion = criterion,
              selected_ncomp = sel$selected_ncomp,
              matched_target = !is.na(sel$selected_ncomp) && sel$selected_ncomp == target_ncomp,
              runtime_sec = unname(timing["elapsed"]),
              seed = selection_seed,
              status = sel$status,
              notes = sel$notes
            )
          }
        }

        for (imp_method in imputation_methods) {
          imputation_seed <- .seed_with_offset(miss_seed, 50000L + row_id)
          imp <- tryCatch(
            impute_pls_data(
              x = miss$x_incomplete,
              method = imp_method,
              seed = imputation_seed,
              m = max(1L, as.integer(round(.normalize_missing_prop(missing_prop) * 100)))
            ),
            error = function(e) e
          )

          for (criterion in criteria) {
            if (inherits(imp, "error")) {
              sel <- .selection_result(
                selection_method = paste0("complete_", imp_method),
                criterion = criterion,
                selected_ncomp = NA_integer_,
                criterion_value = NA_real_,
                max_ncomp = max_ncomp,
                seed = imputation_seed,
                status = "error",
                notes = conditionMessage(imp)
              )
              elapsed <- NA_real_
              selection_seed <- imputation_seed
            } else {
              selection_seed <- .seed_with_offset(imputation_seed, match(criterion, criteria))
              timing <- system.time(
                sel <- select_ncomp(
                  x = imp,
                  y = dataset$y,
                  method = "complete",
                  criterion = criterion,
                  max_ncomp = max_ncomp,
                  seed = selection_seed,
                  folds = folds
                )
              )
              elapsed <- unname(timing["elapsed"])
            }

            row_id <- row_id + 1L
            rows[[row_id]] <- .study_row(
              study = "real_data",
              dataset = dataset$name,
              n = nrow(dataset$x),
              p = ncol(dataset$x),
              true_ncomp = NA_integer_,
              target_ncomp = target_ncomp,
              mechanism = mechanism,
              missing_prop = .normalize_missing_prop(missing_prop) * 100,
              method = .imputation_method_label(imp_method),
              criterion = criterion,
              selected_ncomp = sel$selected_ncomp,
              matched_target = !is.na(sel$selected_ncomp) && sel$selected_ncomp == target_ncomp,
              runtime_sec = elapsed,
              seed = selection_seed,
              status = sel$status,
              notes = sel$notes
            )
          }
        }
      }
    }
  }

  do.call(rbind, rows)
}

Any scripts or data that you put into this service are public.

missPLS documentation built on April 30, 2026, 9:09 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

missPLS
Methods and Reproducible Workflows for Partial Least Squares with Missing Data

R/run_real_data_study.R
In missPLS: Methods and Reproducible Workflows for Partial Least Squares with Missing Data

Defines functions run_real_data_study

Documented in run_real_data_study

Try the missPLS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

missPLS Methods and Reproducible Workflows for Partial Least Squares with Missing Data

R/run_real_data_study.R In missPLS: Methods and Reproducible Workflows for Partial Least Squares with Missing Data

Defines functions run_real_data_study

Documented in run_real_data_study

Try the missPLS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

missPLS
Methods and Reproducible Workflows for Partial Least Squares with Missing Data

R/run_real_data_study.R
In missPLS: Methods and Reproducible Workflows for Partial Least Squares with Missing Data