umbrella: A conglomeration of modeling and data munging functionality

Documented in lasso

#' LASSO for regression and classification.
#'
#' \code{lasso} implements standard LASSO regression and classification.
#'
#' The lasso function implements LASSO regression, as found in the
#' gamlr package, for variable selection and prediction. It handles standard
#' OLS regression and binomial logistic regression.
#'
#' @param train_df An input dataframe with \code{y} and \code{X}.
#' @param formula A formula for the regression specification.
#' @param family A string that specifies either 'gaussian' or 'binomial'.
#' @param test_df A dataframe containing the same columns as \code{train_df}.
#'   The training set.
#' @param predict_df A dataframe matching \code{train_df}.
#'   This is to generate predictions using the trained & tested model.
#'   This argument is optional.
#' @param free_vars A string or character vector specifying which covariate(s)
#'   to never penalize. This argument is optional.
#' @param nfold The number of cross-validation folds. Only specify if
#'   cross-validation is desired. This argument is optional.
#' @param lambda A string specifying which lambda to use for prediction.
#'   Typically either "min" or "1se". Default value is "min".
#' @param sparsity_threshold A numeric value in \eqn{[0, 1]}. Any variable
#'   with a percentage of sparsity greater than this value will be dropped.
#'   This argument is optional.
#' @param verbose Logical indicating whether to return progress statements.
#'   Default is \code{FALSE}.
#' @param ... Generic argument to which you can pass any other valid gamlr
#'   argument, such as \code{standardize = FALSE}.
#' @return A list containing the LASSO \code{model}, \code{predicted_values},
#'  \code{residuals}, and selected \code{variables}.
#' @examples 
#' \dontrun{
#' idx <- train_test_validate(iris$Sepal.Length, train.p = .6, test.p = .2)
#'
#' lasso_model <- lasso(train_df = iris[idx$train, ],
#'                      formula = Sepal.Length ~ .,
#'                      family = "gaussian",
#'                      test_df = iris[idx$test, ],
#'                      predict_df = iris[idx$validate, ],
#'                      nfold = 5,
#'                      verbose = TRUE)
#' }
#' @export
lasso <- function(train_df,
                  formula,
                  family,
                  test_df            = NULL,
                  predict_df         = NULL,
                  free_vars          = NULL,
                  nfold              = NULL,
                  lambda             = "min",
                  sparsity_threshold = NULL,
                  verbose            = FALSE,
                  ...) {
  # Error checking for family type
  assert(family %in% c("gaussian", "binomial"),
         paste0("Family '", family, "' is not currently supported"))
  # Create sparse matrix and remove constant cols
  x <- Matrix::sparse.model.matrix(stats::as.formula(formula_rhs(formula)), 
                                   data = dplyr::select(train_df,
                                                        -formula_lhs(formula)))
  x <- x[, unique(Matrix::summary(x)$j)][, -1]
  # Calculate sparsity of model matrix if desired
  if (!is.null(sparsity_threshold)) {
    # Which vars are above sparsity threshold
    trim_vars <- dplyr::pull(
      dplyr::filter(
        sparsity(x),
        get("PERC_SPARSE") > sparsity_threshold
      ),
      "VARIABLE"
    )
    # Remove these cols from input matrix
    x <- x[, !colnames(x) %in% trim_vars]
    if(verbose == TRUE) {cat("Removed all variables with sparsity >",
                             paste0(sparsity_threshold, "%"),
                             "\n")}
  }
  if (verbose == TRUE) {cat("Finished creating train model matrix\n")}
  # Getting indices of unpenalized columns
  free_idx <- if (is.null(free_vars)) {
    NULL
  } else {
    grep(paste(free_vars, collapse = "|"), colnames(x))
  }
  # Run LASSO with/without cv
  model <- if (is.null(nfold)) {
    if (verbose == TRUE) {cat("Running LASSO without cross-validation\n")}
    gamlr::gamlr(x      = x,
                 y      = dplyr::pull(train_df, formula_lhs(formula)),
                 family = family,
                 free   = free_idx,
                 ...)
  } else {
    if (verbose == TRUE) {cat("Running", nfold, "fold cross-validated LASSO\n")}
    gamlr::cv.gamlr(x      = x,
                    y      = dplyr::pull(train_df, formula_lhs(formula)),
                    family = family,
                    nfold  = nfold,
                    free   = free_idx,
                    ...)
  }
  # Get predicted values and residuals for test data
  if (!is.null(test_df)) {
  # Convert test_x into model.matrix
  test_x <- Matrix::sparse.model.matrix(stats::as.formula(formula_rhs(formula)), 
                                        data = test_df)
  test_x <- test_x[, colnames(x)]
  if (verbose == TRUE) {cat("Finished creating test model matrix\n")}
  # Predict values using best/user-specified lambda
  if (verbose == TRUE) {cat("Returning results\n")}
  if (family == "binomial") {
    if (is.null(nfold)) {
      test_values <- stats::predict(model,
                                    test_x,
                                    type = "response")
    } else {
      test_values <- stats::predict(model,
                                    test_x,
                                    select = lambda,
                                    type = "response")
    }
  } else {
    if (is.null(nfold)) {
      test_values <- stats::predict(model, test_x)
    } else {
      test_values <- stats::predict(model,
                                    test_x,
                                    select = lambda)
    }
  }
  # Generate residuals
  resids <- dplyr::pull(test_df, formula_lhs(formula)) - as.vector(test_values)
  }
  # Get predicted values for prediction data
  if (!is.null(predict_df)) {
    predict_x <- Matrix::sparse.model.matrix(stats::as.formula(formula_rhs(formula)), 
                                             data = predict_df)
    predict_x <- predict_x[, colnames(x)]
    if (verbose == TRUE) {cat("Finished creating prediction model matrix\n")}
    if (family == "binomial") {
      if (is.null(nfold)) {
        values <- stats::predict(model,
                                 predict_x,
                                 type = "response")
      } else {
        values <- stats::predict(model,
                                 predict_x,
                                 select = lambda,
                                 type = "response")
      }
    } else {
      if (is.null(nfold)) {
        values <- stats::predict(model, predict_x)
      } else {
        values <- stats::predict(model,
                                 predict_x,
                                 select = lambda)
      }
    }
  }
  # Generate tibble for non-zero coefficients
  non_zero_coef <- tibble::tibble()
  # Get non-zero coefficients
  c <- as.matrix(stats::coef(model))
  # Bind non-zero coefficients into dataframe
  non_zero_coef <- dplyr::bind_rows(
    non_zero_coef,
    tryCatch(
      tibble::tibble("selected_variables" = names(c[which(c != 0), ])),
      error = function(e) {tibble::tibble("selected_variables" = c("None"))}
    )
  )
  non_zero_coef <- dplyr::filter(
    non_zero_coef,
    get("selected_variables") != "intercept"
  )
  # Calculate measures of accuracy
  if (!is.null(test_df)) {
    if (family == "binomial") {
      test_auc <- ModelMetrics::auc(dplyr::pull(test_df, formula_lhs(formula)), 
                                    as.vector(test_values))
    } else {
      test_RMSE <- caret::RMSE(as.vector(test_values), 
                               dplyr::pull(test_df, formula_lhs(formula)))
      test_R2   <- caret::R2(as.vector(test_values), 
                             dplyr::pull(test_df, formula_lhs(formula)))
    }
  }
  # Return predicted values
  out <- if(!is.null(test_df)) {
    if (family == "binomial") {
      list(model = model,
           test_predicted = dplyr::bind_cols(values = as.vector(test_values)),
           test_residuals = dplyr::bind_cols(residuals = as.vector(resids)),
           test_auc = test_auc,
           selected_variables = non_zero_coef)
    } else {
      list(model = model,
           test_predicted = dplyr::bind_cols(values = as.vector(test_values)),
           test_residuals = dplyr::bind_cols(residuals = as.vector(resids)),
           test_RMSE = test_RMSE,
           test_R2 = test_R2,
           selected_variables = non_zero_coef)
    }
  } else {
    list(model = model,
         selected_variables = non_zero_coef)
  }
  # Return predicted values from predict_x if valid
  if (!is.null(predict_df)) {out <- append(
    out,
    list(predicted = tibble::tibble(values = as.vector(values)))
  )}
  return(out)
}