umbrella: A conglomeration of modeling and data munging functionality

Documented in bolasso

#' Bootstrapped LASSO (Bolasso) for regression and classification.
#'
#' \code{bolasso} implements Bootstrapped LASSO regression and classification.
#'
#' The bolasso function bootstraps LASSO regression, as found in the
#' gamlr package, across n bootstrap partitions. This achieves more robust
#' variable selection than standard LASSO and improves predictive accuracy.
#' It handles standard OLS regression and binomial logistic regression.
#'
#' @param train_df An input dataframe with \code{y} and \code{X}.
#' @param nboot A numeric value specifying the number of bootstrap replicates.
#' @param formula A formula for the covariates.
#' @param family A string that specifies either 'gaussian' or 'binomial'.
#' @param test_df A dataframe containing the same columns as \code{train_df}.
#'   The training set.
#' @param predict_df A dataframe matching \code{train_df}.
#'   This is to generate predictions using the trained & tested model.
#'   This argument is optional.
#' @param free_vars A string or character vector specifying which covariate(s)
#'   to never penalize. This argument is optional.
#' @param nfold The number of cross-validation folds. Only specify if
#'   cross-validation is desired. This argument is optional.
#' @param lambda A string specifying which lambda to use for prediction when
#'   utilizing cross-validation. Typically either "min" or "1se". Default
#'   value is "min".
#' @param sparsity_threshold A numeric value in \eqn{[0, 1]}. Any variable
#'   with a percentage of sparsity greater than this value will be dropped.
#'   This argument is optional.
#' @param selection_threshold A numeric value in \eqn{[0, 1]}. Variables must
#'   be selected in at least this percentage of bootstrap replicates. Default
#'   value is 0.9.
#' @param verbose Logical indicating whether to return progress statements.
#'   Default is \code{FALSE}.
#' @param ... Generic argument to which you can pass any other valid gamlr
#'   argument, such as \code{standardize = FALSE}.
#' @return A list containing the LASSO \code{model}, \code{predicted_values},
#'  \code{residuals}, and selected \code{variables}.
#' @examples 
#' \dontrun{
#' idx <- train_test_validate(iris$Sepal.Length, train.p = .6, test.p = .2)
#' 
#' initialize_parallel()
#'
#' bolasso_model <- bolasso(train_df = iris[idx$train, ],
#'                          nboot =100,
#'                          formula = Sepal.Length ~ .,
#'                          family = "gaussian",
#'                          test_df = iris[idx$test, ],
#'                          predict_df = iris[idx$validate, ],
#'                          nfold = 5,
#'                          verbose = TRUE)
#' }
#' @export
bolasso <- function(train_df,
                    nboot,
                    formula,
                    family,
                    test_df             = NULL,
                    predict_df          = NULL,
                    free_vars           = NULL,
                    nfold               = NULL,
                    lambda              = "min",
                    sparsity_threshold  = NULL,
                    selection_threshold = .9,
                    verbose             = FALSE,
                    ...) {
  # Initialize vector for n bootstrap reps
  replicates <- caret::createResample(train_df[[1]], nboot)
  if (verbose == TRUE) {cat("Finished creating", nboot, "bootstrap samples\n")}
  # Run lasso function over bootstrap folds
  # Return a list of 100 predicted values, residuals, and selected variables
  if (verbose == TRUE) {cat("Running LASSO on bootstrap samples\n")}
  model_list <- future.apply::future_lapply(
    replicates,
    function(i) {
      # Subset input by indexes
      train_df <- train_df[i, ]
      model <- suppressMessages(
        lasso(train_df           = train_df,
              formula            = formula,
              family             = family,
              test_df            = test_df,
              predict_df         = predict_df,
              free_vars          = free_vars,
              nfold              = nfold,
              sparsity_threshold = sparsity_threshold,
              lambda             = lambda,
              ...                = ...)
      )
      return(model)
    })
  if (verbose == TRUE) {cat("Returning results\n")}
  if (!is.null(test_df)) {
    # Bind all 100 predicted values into dataframe
    test_predicted <- suppressMessages(
      dplyr::bind_cols(
        lapply(
          model_list,
          function(j) {
            j$test_predicted
          }
        )
      )
    )
    # Rename predicted_values columns
    colnames(test_predicted) <- 1:nboot
    # Calculate average predicted values across all folds
    avg_test_predicted <- rowMeans(test_predicted)
    # Return bolasso residuals as columns
    test_residuals <- suppressMessages(
      dplyr::bind_cols(
        lapply(
          model_list,
          function(j) {
            j$test_residuals
          }
        )
      )
    )
    # Rename residuals columns
    colnames(test_residuals) <- 1:nboot
    # Gather bolasso selected coefficients
    selected_variables <- dplyr::bind_rows(
      lapply(
        model_list,
        function(j) {
          j$selected_variables
        }
      )
    )
  } else {
    # Gather bolasso selected coefficients
    selected_variables <- dplyr::bind_rows(
      lapply(
        model_list,
        function(j) {
          j$selected_variables
        }
      )
    )
  }
  # Get average predicted predict_x values
  if (!is.null(predict_df)) {
    # Bind all 100 predicted values into dataframe
    predicted <- suppressMessages(
      dplyr::bind_cols(
        lapply(
          model_list,
          function(j) {
            j$predicted
          }
        )
      )
    )
    # Rename predicted_values columns
    colnames(predicted) <- 1:nboot
    # Calculate average predicted values across all folds
    avg_predicted <- rowMeans(test_predicted)
  }
  # Calculate selected variables
  selected_variables <- dplyr::select(
    dplyr::filter(
      dplyr::mutate(
        dplyr::count(
          selected_variables,
          selected_variables
        ),
        "perc_chosen" = get("n") / nboot
      ),
      get("perc_chosen") >= selection_threshold
    ),
    selected_variables
  )
  # Calculate measures of accuracy
  if (!is.null(test_df)) {
    if (family == "binomial") {
      test_auc <- ModelMetrics::auc(dplyr::pull(test_df, formula_lhs(formula)), 
                                    avg_test_predicted)
    } else {
      test_RMSE <- caret::RMSE(avg_test_predicted,
                               dplyr::pull(test_df, formula_lhs(formula)))
      test_R2   <- caret::R2(avg_test_predicted,
                             dplyr::pull(test_df, formula_lhs(formula)))
    }
  }
  # Return list of outputs
  out <- if (!is.null(test_df)) {
    if (family == "binomial") {
      list(
        avg_test_predicted = suppressMessages(
          dplyr::bind_cols(
            values = avg_test_predicted
          )
        ),
        test_predicted     = test_predicted,
        test_residuals     = test_residuals,
        test_auc           = test_auc,
        selected_variables = selected_variables
      )
    } else {
      list(
        avg_test_predicted = suppressMessages(
          dplyr::bind_cols(
            values = avg_test_predicted
          )
        ),
        test_predicted     = test_predicted,
        test_residuals     = test_residuals,
        test_RMSE          = test_RMSE,
        test_R2            = test_R2,
        selected_variables = selected_variables
      )
    }
  } else {
    list(
      selected_variables = selected_variables
    )
  }
  # Return predicted values from predict_x if valid
  if (!is.null(predict_df)) {out <- append(
    out,
    list(avg_predicted = tibble::tibble(values = as.vector(avg_predicted)),
         predicted     = predicted)
  )}
  return(out)
}