#' LASSO for regression and classification.
#'
#' \code{lasso} implements standard LASSO regression and classification.
#'
#' The lasso function implements LASSO regression, as found in the
#' gamlr package, for variable selection and prediction. It handles standard
#' OLS regression and binomial logistic regression.
#'
#' @param train_df An input dataframe with \code{y} and \code{X}.
#' @param formula A formula for the regression specification.
#' @param family A string that specifies either 'gaussian' or 'binomial'.
#' @param test_df A dataframe containing the same columns as \code{train_df}.
#' The training set.
#' @param predict_df A dataframe matching \code{train_df}.
#' This is to generate predictions using the trained & tested model.
#' This argument is optional.
#' @param free_vars A string or character vector specifying which covariate(s)
#' to never penalize. This argument is optional.
#' @param nfold The number of cross-validation folds. Only specify if
#' cross-validation is desired. This argument is optional.
#' @param lambda A string specifying which lambda to use for prediction.
#' Typically either "min" or "1se". Default value is "min".
#' @param sparsity_threshold A numeric value in \eqn{[0, 1]}. Any variable
#' with a percentage of sparsity greater than this value will be dropped.
#' This argument is optional.
#' @param verbose Logical indicating whether to return progress statements.
#' Default is \code{FALSE}.
#' @param ... Generic argument to which you can pass any other valid gamlr
#' argument, such as \code{standardize = FALSE}.
#' @return A list containing the LASSO \code{model}, \code{predicted_values},
#' \code{residuals}, and selected \code{variables}.
#' @examples
#' \dontrun{
#' idx <- train_test_validate(iris$Sepal.Length, train.p = .6, test.p = .2)
#'
#' lasso_model <- lasso(train_df = iris[idx$train, ],
#' formula = Sepal.Length ~ .,
#' family = "gaussian",
#' test_df = iris[idx$test, ],
#' predict_df = iris[idx$validate, ],
#' nfold = 5,
#' verbose = TRUE)
#' }
#' @export
lasso <- function(train_df,
formula,
family,
test_df = NULL,
predict_df = NULL,
free_vars = NULL,
nfold = NULL,
lambda = "min",
sparsity_threshold = NULL,
verbose = FALSE,
...) {
# Error checking for family type
assert(family %in% c("gaussian", "binomial"),
paste0("Family '", family, "' is not currently supported"))
# Create sparse matrix and remove constant cols
x <- Matrix::sparse.model.matrix(stats::as.formula(formula_rhs(formula)),
data = dplyr::select(train_df,
-formula_lhs(formula)))
x <- x[, unique(Matrix::summary(x)$j)][, -1]
# Calculate sparsity of model matrix if desired
if (!is.null(sparsity_threshold)) {
# Which vars are above sparsity threshold
trim_vars <- dplyr::pull(
dplyr::filter(
sparsity(x),
get("PERC_SPARSE") > sparsity_threshold
),
"VARIABLE"
)
# Remove these cols from input matrix
x <- x[, !colnames(x) %in% trim_vars]
if(verbose == TRUE) {cat("Removed all variables with sparsity >",
paste0(sparsity_threshold, "%"),
"\n")}
}
if (verbose == TRUE) {cat("Finished creating train model matrix\n")}
# Getting indices of unpenalized columns
free_idx <- if (is.null(free_vars)) {
NULL
} else {
grep(paste(free_vars, collapse = "|"), colnames(x))
}
# Run LASSO with/without cv
model <- if (is.null(nfold)) {
if (verbose == TRUE) {cat("Running LASSO without cross-validation\n")}
gamlr::gamlr(x = x,
y = dplyr::pull(train_df, formula_lhs(formula)),
family = family,
free = free_idx,
...)
} else {
if (verbose == TRUE) {cat("Running", nfold, "fold cross-validated LASSO\n")}
gamlr::cv.gamlr(x = x,
y = dplyr::pull(train_df, formula_lhs(formula)),
family = family,
nfold = nfold,
free = free_idx,
...)
}
# Get predicted values and residuals for test data
if (!is.null(test_df)) {
# Convert test_x into model.matrix
test_x <- Matrix::sparse.model.matrix(stats::as.formula(formula_rhs(formula)),
data = test_df)
test_x <- test_x[, colnames(x)]
if (verbose == TRUE) {cat("Finished creating test model matrix\n")}
# Predict values using best/user-specified lambda
if (verbose == TRUE) {cat("Returning results\n")}
if (family == "binomial") {
if (is.null(nfold)) {
test_values <- stats::predict(model,
test_x,
type = "response")
} else {
test_values <- stats::predict(model,
test_x,
select = lambda,
type = "response")
}
} else {
if (is.null(nfold)) {
test_values <- stats::predict(model, test_x)
} else {
test_values <- stats::predict(model,
test_x,
select = lambda)
}
}
# Generate residuals
resids <- dplyr::pull(test_df, formula_lhs(formula)) - as.vector(test_values)
}
# Get predicted values for prediction data
if (!is.null(predict_df)) {
predict_x <- Matrix::sparse.model.matrix(stats::as.formula(formula_rhs(formula)),
data = predict_df)
predict_x <- predict_x[, colnames(x)]
if (verbose == TRUE) {cat("Finished creating prediction model matrix\n")}
if (family == "binomial") {
if (is.null(nfold)) {
values <- stats::predict(model,
predict_x,
type = "response")
} else {
values <- stats::predict(model,
predict_x,
select = lambda,
type = "response")
}
} else {
if (is.null(nfold)) {
values <- stats::predict(model, predict_x)
} else {
values <- stats::predict(model,
predict_x,
select = lambda)
}
}
}
# Generate tibble for non-zero coefficients
non_zero_coef <- tibble::tibble()
# Get non-zero coefficients
c <- as.matrix(stats::coef(model))
# Bind non-zero coefficients into dataframe
non_zero_coef <- dplyr::bind_rows(
non_zero_coef,
tryCatch(
tibble::tibble("selected_variables" = names(c[which(c != 0), ])),
error = function(e) {tibble::tibble("selected_variables" = c("None"))}
)
)
non_zero_coef <- dplyr::filter(
non_zero_coef,
get("selected_variables") != "intercept"
)
# Calculate measures of accuracy
if (!is.null(test_df)) {
if (family == "binomial") {
test_auc <- ModelMetrics::auc(dplyr::pull(test_df, formula_lhs(formula)),
as.vector(test_values))
} else {
test_RMSE <- caret::RMSE(as.vector(test_values),
dplyr::pull(test_df, formula_lhs(formula)))
test_R2 <- caret::R2(as.vector(test_values),
dplyr::pull(test_df, formula_lhs(formula)))
}
}
# Return predicted values
out <- if(!is.null(test_df)) {
if (family == "binomial") {
list(model = model,
test_predicted = dplyr::bind_cols(values = as.vector(test_values)),
test_residuals = dplyr::bind_cols(residuals = as.vector(resids)),
test_auc = test_auc,
selected_variables = non_zero_coef)
} else {
list(model = model,
test_predicted = dplyr::bind_cols(values = as.vector(test_values)),
test_residuals = dplyr::bind_cols(residuals = as.vector(resids)),
test_RMSE = test_RMSE,
test_R2 = test_R2,
selected_variables = non_zero_coef)
}
} else {
list(model = model,
selected_variables = non_zero_coef)
}
# Return predicted values from predict_x if valid
if (!is.null(predict_df)) {out <- append(
out,
list(predicted = tibble::tibble(values = as.vector(values)))
)}
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.