#' Bootstrapped LASSO (Bolasso) for regression and classification.
#'
#' \code{bolasso} implements Bootstrapped LASSO regression and classification.
#'
#' The bolasso function bootstraps LASSO regression, as found in the
#' gamlr package, across n bootstrap partitions. This achieves more robust
#' variable selection than standard LASSO and improves predictive accuracy.
#' It handles standard OLS regression and binomial logistic regression.
#'
#' @param train_df An input dataframe with \code{y} and \code{X}.
#' @param nboot A numeric value specifying the number of bootstrap replicates.
#' @param formula A formula for the covariates.
#' @param family A string that specifies either 'gaussian' or 'binomial'.
#' @param test_df A dataframe containing the same columns as \code{train_df}.
#' The training set.
#' @param predict_df A dataframe matching \code{train_df}.
#' This is to generate predictions using the trained & tested model.
#' This argument is optional.
#' @param free_vars A string or character vector specifying which covariate(s)
#' to never penalize. This argument is optional.
#' @param nfold The number of cross-validation folds. Only specify if
#' cross-validation is desired. This argument is optional.
#' @param lambda A string specifying which lambda to use for prediction when
#' utilizing cross-validation. Typically either "min" or "1se". Default
#' value is "min".
#' @param sparsity_threshold A numeric value in \eqn{[0, 1]}. Any variable
#' with a percentage of sparsity greater than this value will be dropped.
#' This argument is optional.
#' @param selection_threshold A numeric value in \eqn{[0, 1]}. Variables must
#' be selected in at least this percentage of bootstrap replicates. Default
#' value is 0.9.
#' @param verbose Logical indicating whether to return progress statements.
#' Default is \code{FALSE}.
#' @param ... Generic argument to which you can pass any other valid gamlr
#' argument, such as \code{standardize = FALSE}.
#' @return A list containing the LASSO \code{model}, \code{predicted_values},
#' \code{residuals}, and selected \code{variables}.
#' @examples
#' \dontrun{
#' idx <- train_test_validate(iris$Sepal.Length, train.p = .6, test.p = .2)
#'
#' initialize_parallel()
#'
#' bolasso_model <- bolasso(train_df = iris[idx$train, ],
#' nboot =100,
#' formula = Sepal.Length ~ .,
#' family = "gaussian",
#' test_df = iris[idx$test, ],
#' predict_df = iris[idx$validate, ],
#' nfold = 5,
#' verbose = TRUE)
#' }
#' @export
bolasso <- function(train_df,
nboot,
formula,
family,
test_df = NULL,
predict_df = NULL,
free_vars = NULL,
nfold = NULL,
lambda = "min",
sparsity_threshold = NULL,
selection_threshold = .9,
verbose = FALSE,
...) {
# Initialize vector for n bootstrap reps
replicates <- caret::createResample(train_df[[1]], nboot)
if (verbose == TRUE) {cat("Finished creating", nboot, "bootstrap samples\n")}
# Run lasso function over bootstrap folds
# Return a list of 100 predicted values, residuals, and selected variables
if (verbose == TRUE) {cat("Running LASSO on bootstrap samples\n")}
model_list <- future.apply::future_lapply(
replicates,
function(i) {
# Subset input by indexes
train_df <- train_df[i, ]
model <- suppressMessages(
lasso(train_df = train_df,
formula = formula,
family = family,
test_df = test_df,
predict_df = predict_df,
free_vars = free_vars,
nfold = nfold,
sparsity_threshold = sparsity_threshold,
lambda = lambda,
... = ...)
)
return(model)
})
if (verbose == TRUE) {cat("Returning results\n")}
if (!is.null(test_df)) {
# Bind all 100 predicted values into dataframe
test_predicted <- suppressMessages(
dplyr::bind_cols(
lapply(
model_list,
function(j) {
j$test_predicted
}
)
)
)
# Rename predicted_values columns
colnames(test_predicted) <- 1:nboot
# Calculate average predicted values across all folds
avg_test_predicted <- rowMeans(test_predicted)
# Return bolasso residuals as columns
test_residuals <- suppressMessages(
dplyr::bind_cols(
lapply(
model_list,
function(j) {
j$test_residuals
}
)
)
)
# Rename residuals columns
colnames(test_residuals) <- 1:nboot
# Gather bolasso selected coefficients
selected_variables <- dplyr::bind_rows(
lapply(
model_list,
function(j) {
j$selected_variables
}
)
)
} else {
# Gather bolasso selected coefficients
selected_variables <- dplyr::bind_rows(
lapply(
model_list,
function(j) {
j$selected_variables
}
)
)
}
# Get average predicted predict_x values
if (!is.null(predict_df)) {
# Bind all 100 predicted values into dataframe
predicted <- suppressMessages(
dplyr::bind_cols(
lapply(
model_list,
function(j) {
j$predicted
}
)
)
)
# Rename predicted_values columns
colnames(predicted) <- 1:nboot
# Calculate average predicted values across all folds
avg_predicted <- rowMeans(test_predicted)
}
# Calculate selected variables
selected_variables <- dplyr::select(
dplyr::filter(
dplyr::mutate(
dplyr::count(
selected_variables,
selected_variables
),
"perc_chosen" = get("n") / nboot
),
get("perc_chosen") >= selection_threshold
),
selected_variables
)
# Calculate measures of accuracy
if (!is.null(test_df)) {
if (family == "binomial") {
test_auc <- ModelMetrics::auc(dplyr::pull(test_df, formula_lhs(formula)),
avg_test_predicted)
} else {
test_RMSE <- caret::RMSE(avg_test_predicted,
dplyr::pull(test_df, formula_lhs(formula)))
test_R2 <- caret::R2(avg_test_predicted,
dplyr::pull(test_df, formula_lhs(formula)))
}
}
# Return list of outputs
out <- if (!is.null(test_df)) {
if (family == "binomial") {
list(
avg_test_predicted = suppressMessages(
dplyr::bind_cols(
values = avg_test_predicted
)
),
test_predicted = test_predicted,
test_residuals = test_residuals,
test_auc = test_auc,
selected_variables = selected_variables
)
} else {
list(
avg_test_predicted = suppressMessages(
dplyr::bind_cols(
values = avg_test_predicted
)
),
test_predicted = test_predicted,
test_residuals = test_residuals,
test_RMSE = test_RMSE,
test_R2 = test_R2,
selected_variables = selected_variables
)
}
} else {
list(
selected_variables = selected_variables
)
}
# Return predicted values from predict_x if valid
if (!is.null(predict_df)) {out <- append(
out,
list(avg_predicted = tibble::tibble(values = as.vector(avg_predicted)),
predicted = predicted)
)}
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.