R/train_model4.R

# Training model 4

#' Train model 4
#'
#' Similar to model 1, but with tentative features from Boruta.
#'
#' @param dat a \code{data.frame} of input data.
#' @param response_name column name of the response.
#' @param outer_cv_folds outer cross-validation fold number.
#' @param inner_cv_folds inner cross-validation fold number.
#' @param random_state random seed.
#' @param mtry a numeric vector of \code{mtry} values for parameter tuning;
#'   0 means use caret's default grid.
#' @param save_level if \code{save_level > 0}, save the outer training
#'   indexes; if \code{save_level > 1}, also save the calibrated probabilities
#'   and selected features.
#' @param save_prefix output file prefix.
#' @param overwrite overwrite existing result files or not.
#' @param output output directory.
#' @param verbose logical; print progress messages.
#' @return a list of cross-validation results, one per given \code{mtry} value.
#' @details Key steps:
#'   \enumerate{
#'     \item The tuning loop tunes a single parameter, \code{mtry}.
#'     \item The outer cross-validation splits the data set into M folds of
#'       training and testing sets.
#'     \item For each outer training set, features are selected with the
#'       Boruta algorithm (keeping tentative features), and a random forest
#'       classifier is tuned with N-fold inner cross-validation. A ridge
#'       multinomial logistic regression is then trained on the random forest
#'       class probabilities as a calibration model; its lambda is selected
#'       with \code{\link[glmnet]{cv.glmnet}}. Finally, the random forest and
#'       calibration models are applied to the testing set of the outer CV.
#'       See the usage sketch in the examples below.
#'   }
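#' @examples
#' \dontrun{
#' # A minimal usage sketch, not a tested workflow: `dat` is assumed to be a
#' # data.frame with a factor response column named "label"; the mtry values
#' # and output directory below are illustrative.
#' tune_result <- train_model4(
#'   dat,
#'   response_name = "label",
#'   mtry = c(2, 4, 8),
#'   output = tempdir()
#' )
#' }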
#' @export
train_model4 <- function(dat,
                         response_name,
                         outer_cv_folds = 5,
                         inner_cv_folds = 5,
                         random_state = 56,
                         mtry = NULL,
                         save_level = 3,
                         save_prefix = "train_model4_",
                         overwrite = FALSE,
                         output = NULL,
                         verbose = TRUE) {
  if (save_level > 0 && is.null(output)) {
    stop("output is required when save_level > 0")
  }
  if (!is.null(output) && !dir.exists(output)) {
    dir.create(output, recursive = TRUE)
  }
  if (!(response_name %in% colnames(dat))) {
    stop(glue::glue("failed to find variable {response_name} in dat"))
  }
  responses <- factor(dat[, response_name, drop = TRUE])
  if (is.null(mtry)) {
    warning("mtry is not set; falling back to 0, the sentinel for caret's default grid")
    mtry <- 0
  }
  set.seed(random_state)
  outer_train_indexes_rda <- file.path(output,
                                       glue::glue("{save_prefix}outer_train_indexes.Rda"))
  # Reuse cached folds when the file exists and overwrite = FALSE; otherwise
  # (re)create them.
  if (!is.null(output) &&
      file.exists(outer_train_indexes_rda) &&
      !overwrite) {
    logger::log_debug(
      glue::glue(
        "Loading outer CV folds: {outer_cv_folds} folds and random_state={random_state}"
      )
    )
    load(outer_train_indexes_rda)
  } else {
    logger::log_debug(
      glue::glue(
        "Creating outer CV folds: {outer_cv_folds} folds and random_state={random_state}"
      )
    )
    outer_train_indexes <- caret::createFolds(responses,
                                              k = outer_cv_folds,
                                              list = TRUE,
                                              returnTrain = TRUE)
    if (save_level > 0) {
      save(outer_train_indexes, file = outer_train_indexes_rda)
    }
  }
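  # Tune over the supplied mtry values; 0 is a sentinel meaning "use caret's
  # default grid" (rf_grid = NULL below).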
  tune_result <- lapply(mtry, function(mtry_i) {
    if (mtry_i == 0) {
      rf_grid <- NULL
      mtry_i <- "default"
    } else {
      rf_grid <- expand.grid(mtry = mtry_i)
    }
    logger::log_debug(glue::glue("Use random forest parameters mtry={mtry_i}"))
    cv_result <- lapply(seq_along(outer_train_indexes), function(i) {
      logger::log_debug(glue::glue("Outer fold #{i}"))
      calibrated_prob_response_i_rda <- file.path(
        output,
        glue::glue(
          "{save_prefix}calibrated_prob_response_mtry_{mtry_i}_fold_{i}.Rda"
        )
      )
      if (!is.null(output) &&
          file.exists(calibrated_prob_response_i_rda) &&
          !overwrite) {
        logger::log_debug(
          glue::glue(
            "Loading calibrated prob from {calibrated_prob_response_i_rda}"
          )
        )
        load(calibrated_prob_response_i_rda)
        return(calibrated_prob_response)
      }
      outer_train_index <- outer_train_indexes[[i]]
      outer_fold_result <- train_model4_outer_fold(
        dat = dat,
        response_name = response_name,
        outer_train_index = outer_train_index,
        random_state = random_state + i,
        inner_cv_folds = inner_cv_folds,
        rf_grid = rf_grid,
        verbose = verbose
      )
      calibrated_prob_response <- outer_fold_result$calibrated_prob_response
      gc()
      if (save_level > 1) {
        logger::log_debug(
          glue::glue(
            "Saving calibrated prob and selected features into {calibrated_prob_response_i_rda}"
          )
        )
        selected_features <- outer_fold_result$selected_features
        save(calibrated_prob_response, selected_features, file = calibrated_prob_response_i_rda)
      }
      calibrated_prob_response
    })
    do.call(rbind, cv_result)
  })
  return(tune_result)
}


#' Train model 4 - Process the outer fold
#'
#' @param dat a \code{data.frame} of input data.
#' @param response_name column name of the response.
#' @param outer_train_index outer train sample indexes.
#' @param random_state random seed.
#' @param inner_cv_folds inner cross-validation fold number.
#' @param rf_grid A data frame with possible tuning values. See \code{\link[caret]{train}}.
#' @param verbose logical; print progress messages.
#' @param ... currently unused.
#' @return a list with two elements: a \code{data.frame} of calibrated
#'   probabilities together with the response variable, and a character vector
#'   of selected features.
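#' @examples
#' \dontrun{
#' # A minimal sketch, assuming `dat` has a factor response column "label";
#' # the fold split below is illustrative only.
#' idx <- caret::createFolds(factor(dat$label), k = 5, returnTrain = TRUE)[[1]]
#' fold_res <- train_model4_outer_fold(
#'   dat = dat,
#'   response_name = "label",
#'   outer_train_index = idx,
#'   random_state = 56
#' )
#' head(fold_res$calibrated_prob_response)
#' }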
train_model4_outer_fold <- function(dat,
                                    response_name,
                                    outer_train_index,
                                    random_state,
                                    inner_cv_folds = 5,
                                    rf_grid = NULL,
                                    verbose = TRUE,
                                    ...) {
  outer_train <- dat[outer_train_index, ]
  outer_test <- dat[-outer_train_index, ]
  y <- outer_train[, response_name, drop = TRUE]
  logger::log_debug("Downsampling by the minimum of sample number across the response levels")
  min_n_sample <- min(table(y))
  set.seed(random_state + 1)
  outer_train_downsampled <- outer_train %>%
    dplyr::group_by(dplyr::across(dplyr::all_of(response_name))) %>%
    dplyr::sample_n(size = min_n_sample) %>%
    dplyr::ungroup()
  logger::log_debug("Selecting features with Boruta algorithm")
  selected_features <- select_features_boruta(outer_train_downsampled,
                                              response_name = response_name,
                                              with_tentative = TRUE)
  outer_train <- outer_train[, c(selected_features, response_name)]
  logger::log_debug("Inner cross validation")
  mods <- train_model4_inner_fold(
    outer_train,
    response_name = response_name,
    inner_cv_folds = inner_cv_folds,
    random_state = (random_state + 2),
    rf_grid = rf_grid,
    verbose = verbose
  )
  logger::log_debug("Run random forest model on outer fold testing data set")
  predicted_probs <- predict(mods$rf_model, newdata = outer_test, type = "prob") %>%
    as.matrix()
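  # The random forest class probabilities serve as the features ("newx") of
  # the calibration model below.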
  logger::log_debug("Run calibration model on outer fold testing data set")
  calibrated_probs <- predict(
    mods$calibration_model,
    newx = predicted_probs,
    s = "lambda.min",
    type = "response"
  )
  # For a multinomial fit, predict() returns a 3-D array (samples x classes x
  # lambda); drop the single lambda slice before binding the response column.
  calibrated_prob_response <- cbind(as.data.frame(calibrated_probs[, , 1]),
                                    outer_test[, response_name, drop = FALSE])
  gc()
  return(
    list(
      selected_features = selected_features,
      calibrated_prob_response = calibrated_prob_response
    )
  )
}


#' Train model 4 - process the inner fold
#'
#' @param outer_train a \code{data.frame} holding the training set of one
#'   outer fold of the nested cross-validation.
#' @param response_name column name of the response.
#' @param random_state random seed.
#' @param inner_cv_folds inner cross-validation fold number.
#' @param rf_grid A data frame with possible tuning values. See \code{\link[caret]{train}}.
#' @param verbose logical; print progress messages.
#' @param ... currently unused.
#' @return a list with two elements: the fitted random forest model and the
#'   calibration model (a \code{cv.glmnet} fit).
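#' @examples
#' \dontrun{
#' # A minimal sketch, assuming `outer_train` is a data.frame with a factor
#' # response column "label" (all names here are illustrative).
#' mods <- train_model4_inner_fold(
#'   outer_train,
#'   response_name = "label",
#'   inner_cv_folds = 5,
#'   random_state = 56
#' )
#' predict(mods$rf_model, newdata = outer_train, type = "prob")
#' }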
train_model4_inner_fold <- function(outer_train,
                                    response_name,
                                    inner_cv_folds = 5,
                                    random_state,
                                    rf_grid = NULL,
                                    verbose = TRUE,
                                    ...) {
  responses <- factor(outer_train[, response_name, drop = TRUE])
  train_control <- caret::trainControl(
    method = "cv",
    number = inner_cv_folds,
    classProbs = TRUE,
    summaryFunction = caret::multiClassSummary
  )
  # Train the Random Forest model
  logger::log_debug("Train random forest model")
  set.seed(random_state)
  input_formula <- stats::as.formula(paste(response_name, "~ ."))
  rf_model <- caret::train(
    input_formula,
    data = outer_train,
    method = "rf",
    trControl = train_control,
    tuneGrid = rf_grid,
    verbose = verbose
  )

  # Get predicted probabilities on the training data
  logger::log_debug("Train Ridge regression for calibration")
  predicted_probs <- predict(rf_model, newdata = outer_train, type = "prob") %>%
    as.matrix()
  # Ridge (alpha = 0) multinomial logistic regression on the random forest
  # class probabilities; cv.glmnet selects lambda by cross-validation.
  cv_ridge_multiclass <- glmnet::cv.glmnet(predicted_probs,
                                           outer_train[, response_name, drop = TRUE],
                                           family = "multinomial",
                                           alpha = 0)
  gc()
  return(list(rf_model = rf_model, calibration_model = cv_ridge_multiclass))
}