R/run_simulation_study.R
In missPLS: Methods and Reproducible Workflows for Partial Least Squares with Missing Data

Documented in run_simulation_study

#' Run a simulation study
#'
#' Run the simulation workflows used in the article and thesis.
#'
#' @param dimensions List of `(n, p)` integer pairs.
#' @param true_ncomp Vector of true component counts.
#' @param missing_props Missing-data proportions as fractions or percentages.
#' @param mechanisms Missing-data mechanisms.
#' @param reps Number of replicates.
#' @param seed Optional base random seed.
#' @param max_ncomp Maximum number of extracted components.
#' @param criteria Criteria evaluated on complete and imputed data.
#' @param incomplete_methods Incomplete-data NIPALS workflows.
#' @param imputation_methods Imputation methods.
#' @param folds Number of folds used by `"Q2-10fold"`.
#' @param mar_y_bias MAR bias parameter passed to [add_missingness()].
#'
#' @return A data frame with one row per study run.
#' @export
run_simulation_study <- function(
    dimensions = list(c(500L, 100L), c(500L, 20L), c(100L, 20L), c(80L, 25L),
                      c(60L, 33L), c(40L, 50L), c(20L, 100L)),
    true_ncomp = c(2L, 4L, 6L),
    missing_props = seq(5, 50, 5),
    mechanisms = c("MCAR", "MAR"),
    reps = 1L,
    seed = NULL,
    max_ncomp = 8L,
    criteria = c("Q2-LOO", "Q2-10fold", "AIC", "AIC-DoF", "BIC", "BIC-DoF"),
    incomplete_methods = c("nipals_standard", "nipals_adaptative"),
    imputation_methods = c("mice", "knn", "svd"),
    folds = 10L,
    mar_y_bias = 0.8) {
  seed <- .normalize_seed(seed, allow_null = TRUE)
  reps <- as.integer(reps)
  max_ncomp <- as.integer(max_ncomp)

  rows <- vector("list", 0L)
  row_id <- 0L
  counter <- 0L

  for (dim_pair in dimensions) {
    n <- as.integer(dim_pair[[1L]])
    p <- as.integer(dim_pair[[2L]])
    dataset_name <- sprintf("sim_n%d_p%d", n, p)

    for (target in as.integer(true_ncomp)) {
      for (rep_id in seq_len(reps)) {
        counter <- counter + 1L
        sim_seed <- .seed_with_offset(seed, counter * 1000L)
        sim <- simulate_pls_data(n = n, p = p, true_ncomp = target, seed = sim_seed)

        for (criterion in criteria) {
          selection_seed <- .seed_with_offset(sim_seed, match(criterion, criteria))
          timing <- system.time(
            sel <- select_ncomp(
              x = sim$x,
              y = sim$y,
              method = "complete",
              criterion = criterion,
              max_ncomp = max_ncomp,
              seed = selection_seed,
              folds = folds
            )
          )
          row_id <- row_id + 1L
          rows[[row_id]] <- .study_row(
            study = "simulation",
            dataset = dataset_name,
            n = n,
            p = p,
            true_ncomp = target,
            target_ncomp = target,
            mechanism = NA_character_,
            missing_prop = 0,
            method = "Complete",
            criterion = criterion,
            selected_ncomp = sel$selected_ncomp,
            matched_target = !is.na(sel$selected_ncomp) && sel$selected_ncomp == target,
            runtime_sec = unname(timing["elapsed"]),
            seed = selection_seed,
            status = sel$status,
            notes = sel$notes
          )
        }

        if (!length(missing_props) || !length(mechanisms)) {
          next
        }

        for (mechanism in mechanisms) {
          for (missing_prop in missing_props) {
            miss_seed <- .seed_with_offset(sim_seed, as.integer(round(.normalize_missing_prop(missing_prop) * 1000)) +
              if (mechanism == "MAR") 50000L else 0L)
            miss <- add_missingness(
              x = sim$x,
              y = sim$y,
              mechanism = mechanism,
              missing_prop = missing_prop,
              seed = miss_seed,
              mar_y_bias = mar_y_bias
            )

            for (selection_method in incomplete_methods) {
              for (criterion in criteria) {
                selection_seed <- .seed_with_offset(miss_seed, row_id + 17L)
                timing <- system.time(
                  sel <- select_ncomp(
                    x = miss$x_incomplete,
                    y = sim$y,
                    method = selection_method,
                    criterion = criterion,
                    max_ncomp = max_ncomp,
                    seed = selection_seed,
                    folds = folds
                  )
                )
                row_id <- row_id + 1L
                rows[[row_id]] <- .study_row(
                  study = "simulation",
                  dataset = dataset_name,
                  n = n,
                  p = p,
                  true_ncomp = target,
                  target_ncomp = target,
                  mechanism = mechanism,
                  missing_prop = .normalize_missing_prop(missing_prop) * 100,
                  method = .selection_method_label(selection_method),
                  criterion = criterion,
                  selected_ncomp = sel$selected_ncomp,
                  matched_target = !is.na(sel$selected_ncomp) && sel$selected_ncomp == target,
                  runtime_sec = unname(timing["elapsed"]),
                  seed = selection_seed,
                  status = sel$status,
                  notes = sel$notes
                )
              }
            }

            for (imp_method in imputation_methods) {
              imputation_seed <- .seed_with_offset(miss_seed, 100000L + row_id)
              imp <- tryCatch(
                impute_pls_data(
                  x = miss$x_incomplete,
                  method = imp_method,
                  seed = imputation_seed,
                  m = max(1L, as.integer(round(.normalize_missing_prop(missing_prop) * 100)))
                ),
                error = function(e) e
              )

              for (criterion in criteria) {
                if (inherits(imp, "error")) {
                  sel <- .selection_result(
                    selection_method = paste0("complete_", imp_method),
                    criterion = criterion,
                    selected_ncomp = NA_integer_,
                    criterion_value = NA_real_,
                    max_ncomp = max_ncomp,
                    seed = imputation_seed,
                    status = "error",
                    notes = conditionMessage(imp)
                  )
                  elapsed <- NA_real_
                } else {
                  selection_seed <- .seed_with_offset(imputation_seed, match(criterion, criteria))
                  timing <- system.time(
                    sel <- select_ncomp(
                      x = imp,
                      y = sim$y,
                      method = "complete",
                      criterion = criterion,
                      max_ncomp = max_ncomp,
                      seed = selection_seed,
                      folds = folds
                    )
                  )
                  elapsed <- unname(timing["elapsed"])
                }

                row_id <- row_id + 1L
                rows[[row_id]] <- .study_row(
                  study = "simulation",
                  dataset = dataset_name,
                  n = n,
                  p = p,
                  true_ncomp = target,
                  target_ncomp = target,
                  mechanism = mechanism,
                  missing_prop = .normalize_missing_prop(missing_prop) * 100,
                  method = .imputation_method_label(imp_method),
                  criterion = criterion,
                  selected_ncomp = sel$selected_ncomp,
                  matched_target = !is.na(sel$selected_ncomp) && sel$selected_ncomp == target,
                  runtime_sec = elapsed,
                  seed = if (inherits(imp, "error")) imputation_seed else selection_seed,
                  status = sel$status,
                  notes = sel$notes
                )
              }
            }
          }
        }
      }
    }
  }

  do.call(rbind, rows)
}

Any scripts or data that you put into this service are public.

missPLS documentation built on April 30, 2026, 9:09 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

missPLS
Methods and Reproducible Workflows for Partial Least Squares with Missing Data

R/run_simulation_study.R
In missPLS: Methods and Reproducible Workflows for Partial Least Squares with Missing Data

Defines functions run_simulation_study

Documented in run_simulation_study

Try the missPLS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

missPLS Methods and Reproducible Workflows for Partial Least Squares with Missing Data

R/run_simulation_study.R In missPLS: Methods and Reproducible Workflows for Partial Least Squares with Missing Data

Defines functions run_simulation_study

Documented in run_simulation_study

Try the missPLS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

missPLS
Methods and Reproducible Workflows for Partial Least Squares with Missing Data

R/run_simulation_study.R
In missPLS: Methods and Reproducible Workflows for Partial Least Squares with Missing Data