R/run_pca.R

Defines functions run_pca

Documented in run_pca

#' Apply PCA classifier to MrP.
#'
#' \code{run_pca} is a wrapper function that applies the PCA classifier to data
#' provided by the user, evaluates prediction performance, and chooses the
#' best-performing model.
#'
#' @inheritParams auto_MrP
#' @param data Data for cross-validation. A \code{list} of \eqn{k}
#'   \code{data.frames}, one for each fold to be used in \eqn{k}-fold
#'   cross-validation.
#'
#' @return A model formula of the winning best subset classifier model.

run_pca <- function(y, L1.x, L2.x, L2.unit, L2.reg,
                    loss.unit, loss.fun, data, cores,
                    verbose) {

  # List of all models to be evaluated
  models <- model_list_pca(
      y = y,
      L1.x = L1.x,
      L2.x = L2.x,
      L2.unit = L2.unit,
      L2.reg = L2.reg)

  # prallel tuning if cores > 1
  if( cores > 1 ){

    # Train all models in parallel
    m_errors <- run_best_subset_mc(
      verbose = verbose,
      models = models,
      data = data,
      loss.unit = loss.unit,
      loss.fun = loss.fun,
      y = y,
      L1.x = L1.x,
      L2.x = L2.x,
      L2.unit = L2.unit,
      L2.reg = L2.reg,
      cores = cores)
  } else{
    # Train and evaluate each model
    m_errors <- lapply(seq_along(models), function(m) {
      # Print model m
      if (isTRUE(verbose)) {
        M <- length(models)
        cat(paste("Best subset: Running model ", m,
                  " out of ", M, " models\n", sep = ""))
      }

      # Loop over each fold
      k_errors <- lapply(seq_along(data), function(k) {
        # Split data in training and validation sets
        data_train <- dplyr::bind_rows(data[-k])
        data_valid <- dplyr::bind_rows(data[k])

        # Train mth model on kth training set
        model_m <- best_subset_classifier(model = models[[m]],
                                          data.train = data_train,
                                          model.family = binomial(link = "probit"),
                                          model.optimizer = "bobyqa",
                                          n.iter = 1000000,
                                          verbose = verbose)

        # Use trained model to make predictions for kth validation set
        pred_m <- stats::predict(model_m, newdata = data_valid,
                                 type = "response", allow.new.levels = TRUE)

        # Evaluate predictions based on loss function
        perform_m <- loss_function(pred = pred_m,
                                   data.valid = data_valid,
                                   loss.unit = loss.unit,
                                   loss.fun = loss.fun,
                                   y = y,
                                   L2.unit = L2.unit)
      })

      # Mean over loss functions
      k_errors <- dplyr::bind_rows(k_errors) %>%
        dplyr::group_by(measure) %>%
        dplyr::summarise(value = mean(value), .groups = "drop") %>%
        dplyr::mutate(model = m)
    })
  }

  # Extract best tuning parameters
  grid_cells <- dplyr::bind_rows(m_errors)
  best_params <- dplyr::slice(loss_score_ranking(score = grid_cells, loss.fun = loss.fun), 1)

  # Choose best-performing model
  out <- models[[ dplyr::pull(.data = best_params, var = model) ]]

  # Function output
  return(out)

}

Try the autoMrP package in your browser

Any scripts or data that you put into this service are public.

autoMrP documentation built on Aug. 17, 2023, 5:07 p.m.