#' Best Subset Selection for Generalized Linear Models
#'
#' \code{beset_glm} performs best subset selection using repeated
#' cross-validation to find the optimal number of predictors for several
#' families of generalized linear models.
#'
#' \code{beset_glm} performs best subset selection for generalized linear
#' models, fitting a separate model for each possible combination of predictors
#' (all models that contain exactly 1 predictor, all models that contain
#' exactly 2 predictors, and so forth). For each number of predictors,
#' \code{beset_glm} first picks the model with the best fit and then
#' estimates how well this model predicts new data using \code{k}-fold
#' cross-validation (how well, on average, a model trained using \eqn{k - 1}
#' folds predicts the left-out fold).
#'
#' @section Cross-validation details:
#' \code{beset_glm} randomly partitions the data set into \code{n_folds} *
#' \code{n_repeats} folds within strata (factor levels for factor outcomes,
#' percentile-based groups for numeric outcomes). This ensures that the folds
#' will be matched in terms of the outcome's frequency distribution.
#' \code{beset_glm} also ensures the reproducibility of your analysis by
#' requiring a \code{seed} to the random number generator as one of its
#' arguments.
#'
#' @section List of available families and link functions:
#' \describe{
#' \item{\code{"gaussian"}}{The Gaussian family accepts the links
#' \code{"identity"} (default), \code{"log"}, and \code{"inverse"}.}
#' \item{\code{"binomial"}}{The binomial family accepts the links
#' \code{"logit"} (default), \code{"probit"}, \code{"cauchit"}, \code{"log"}, and
#' \code{"cloglog"} (complementary log-log).}
#' \item{\code{"poisson"}}{The Poisson family accepts the links \code{"log"}
#' (default), \code{"sqrt"}, and \code{"identity"}.}
#' \item{\code{"negbin"}}{The negative binomial family accepts the links
#' \code{"log"} (default), \code{"sqrt"}, and \code{"identity"}.}
#' }
#'
#' @section Warnings:
#' \enumerate{
#' \item \code{beset_glm} handles missing data by performing listwise deletion.
#' No other options for handling missing data are provided at this time. The
#' user is encouraged to deal with missing values prior to running this
#' function.
#' \item \code{beset_glm} is intended for use with additive models only. An
#' exhaustive search over the space of possible interactions and/or non-linear
#' effects is computationally prohibitive, but I hope to offer a greedy search
#' option in the future. In the meantime and in general, I would recommend the
#' \href{https://cran.r-project.org/web/packages/earth/index.html}{MARS}
#' technique if you require this feature.
#' \item \code{beset_glm} is best suited for searching over a small number of
#' predictors (less than 10). For a large number of predictors (more than 20),
#' \code{\link{beset_elnet}} is recommended instead. However, note that
#' \code{\link{beset_elnet}} only works with a more restricted set of
#' distributions.
#' }
#'
#' @name beset_glm
#' @import parallel
#' @import purrr
#' @importFrom utils combn
#' @importFrom stats .getXlevels IQR coef coefficients delete.response
#' formula glm.control glm.fit lm.wfit make.link median
#' model.frame model.matrix model.offset model.response model.weights na.omit
#' na.pass optim predict quantile residuals sd terms var
#' @import dplyr
#'
#'
#' @seealso \code{\link[stats]{glm}},
#' \code{\link[base]{set.seed}}, \code{\link[MASS]{glm.nb}}
#'
#' @param form A model \code{\link[stats]{formula}}.
#'
#' @param data Either a \code{\link{data_partition}} object containing data sets
#' to be used for both model training and testing, or a single data frame that
#' will be used for model training and cross-validation.
#'
#' @param family Character string naming the error distribution to be used in
#' the model. Available families are listed under 'List of available families
#' and link functions'.
#'
#' @param link (Optional) character string naming the link function to be used in
#' the model. Available links and their defaults differ by \code{family} and are
#' listed under 'List of available families and link functions'.
#'
#' @param p_max Maximum number of predictors to attempt to fit. Default is 10.
#'
#' @param force_in (Optional) character vector containing the names of any
#' predictor variables that should be included in every model. (Note that if
#' there is an intercept, it is forced into every model by default.)
#'
#' @param nest_cv \code{Logical} value indicating whether to perform nested
#' cross-validation. If \code{nest_cv = TRUE}, the cross-validation used to
#' select the best model is nested within a cross-validation used to estimate
#' prediction error on a new sample, thus providing an estimate of test error
#' that is free from potential selection bias. Because this multiplicatively
#' increases compute times by a factor equal to the number of folds, the
#' default is \code{FALSE}. Note that setting this parameter to \code{TRUE} will
#' provide more informative summary output regarding the uncertainty in the
#' selection procedure itself, i.e., how often a given model is chosen as
#' "best" according to the given criteria, and is necessary in order for the
#' returned objects to work with certain \code{beset} methods, such as
#' \code{\link{compare}} and \code{\link{importance}}.
#'
#' @param n_folds \code{Integer} indicating the number of folds to use for
#' cross-validation.
#'
#' @param n_reps \code{Integer} indicating the number of times cross-validation should
#' be repeated (with different randomized fold assignments).
#'
#' @param seed \code{Integer} used to seed the random number generator when
#' assigning observations to folds.
#'
#' @param contrasts (Optional) \code{list}. See the \code{contrasts.arg} of
#' \code{\link[stats]{model.matrix.default}}.
#'
#' @param offset (Optional) \code{vector} of length \code{nobs} specifying an
#' \emph{a priori} known component that will be added to the
#' linear predictor before applying the link function. Useful for
#' the "\code{poisson}" family (e.g. log of exposure time), or for
#' refining a model by starting at a current fit. Default is
#' \code{NULL}.
#'
#' @param weights (Optional) \code{numeric vector} of prior weights placed on
#' the observations during model fitting. Default is \code{NULL}.
#'
#' @param start (Optional) starting values for the parameters in the linear
#' predictor.
#'
#' @param etastart (Optional) starting values for the linear predictor.
#'
#' @param mustart (Optional) starting values for the vector of means.
#'
#' @param epsilon \code{Numeric} value of positive convergence tolerance ε; the
#' iterations converge when \eqn{|dev - dev_{old}|/(|dev| + 0.1) < ε}. Default
#' is \code{1e-8}.
#'
#' @param maxit \code{Integer} giving the maximal number of IWLS iterations.
#' Default is 25.
#'
#' @param skinny \code{Logical} value indicating whether or not to return a
#' "skinny" model. If \code{FALSE} (the default), the return object will include
#' a copy of the model \code{\link[stats]{terms}}, \code{data},
#' \code{contrasts}, and a record of the \code{xlevels} of the factors used in
#' fitting. If these features are not needed, setting \code{skinny = TRUE} will
#' prevent these copies from being made.
#'
#' @param n_cores Integer value indicating the number of workers to run in
#' parallel during subset search and cross-validation. By default, this will
#' be set to one fewer than the maximum number of physical cores you have
#' available, as indicated by \code{\link[parallel]{detectCores}}. Set to 1 to
#' disable parallel processing.
#'
#' @param parallel_type (Optional) character string indicating the type of
#' parallel operation to be used, either \code{"fork"} or \code{"sock"}. If
#' omitted and \code{n_cores > 1}, the default is \code{"sock"} for Windows and
#' otherwise either \code{"fork"} or \code{"sock"} depending on which process is
#' being run.
#'
#' @param cl (Optional) \code{\link[parallel]{parallel}} or
#' \code{\link[snow]{snow}} cluster for use if \code{parallel_type = "sock"}.
#' If not supplied, a cluster on the local machine is automatically created.
#'
#' @return A "beset_glm" object with the following components:
#' \describe{
#' \item{stats}{a list with three data frames:
#' \describe{
#' \item{fit}{statistics for every possible combination of predictors:
#' \describe{
#' \item{n_pred}{the total number of predictors in model; note that
#' the number of predictors for a factor variable corresponds to the
#' number of factor levels minus 1}
#' \item{form}{formula for model}
#' \item{aic}{\eqn{-2*log-likelihood + k*npar}, where \eqn{npar}
#' represents the number of parameters in the fitted model, and
#' \eqn{k = 2}}
#' \item{dev}{twice the difference between the log-likelihoods of the
#' saturated and fitted models, multiplied by the scale parameter}
#' \item{mae}{mean absolute error}
#' \item{mce}{mean cross entropy, estimated as
#' \eqn{-log-likelihood/N}, where \eqn{N} is the number of
#' observations}
#' \item{mse}{mean squared error}
#' \item{r2}{R-squared, calculated as
#' \eqn{1 - deviance/null deviance}}
#' }
#' }
#' \item{cv}{a data frame containing cross-validation statistics
#' for the best model for each \code{n_pred} listed in \code{fit_stats}.
#' Each metric is computed using \code{\link{predict_metrics}}, with
#' models fit to \eqn{n-1} folds and predictions made on the left-out fold.
#' Each metric is followed by its standard error. The data frame
#' is otherwise the same as that documented for \code{fit}, except
#' AIC is omitted.}
#' \item{test}{if \code{test_data} is provided, a data frame
#' containing prediction metrics for the best model for each \code{n_pred}
#' listed in \code{fit} as applied to the \code{test_data}.}
#' }
#' }
#' \item{fold_assignments}{list giving the row indices for the holdout
#' observations for each fold and/or repetition of cross-validation}
#' \item{n_folds}{number of folds used in cross-validation}
#' \item{n_reps}{number of repetitions used in cross-validation}
#' \item{family}{name of error distribution used in the model}
#' \item{link}{name of link function used in the model}
#' \item{terms}{the \code{\link[stats]{terms}} object used}
#' \item{data}{the \code{data} argument}
#' \item{offset}{the offset vector used}
#' \item{contrasts}{(where relevant) the contrasts used}
#' \item{xlevels}{(where relevant) a record of the levels of the factors used
#' in fitting}
#' }
#' @examples
#' subset1 <- beset_glm(Fertility ~ ., data = swiss)
#' summary(subset1)
#'
#' # Force variables to be included in model
#' subset2 <- beset_glm(Fertility ~ ., data = swiss,
#' force_in = c("Agriculture", "Examination"))
#' summary(subset2)
#'
#' # Use nested cross-validation to evaluate error in selection
#' subset3 <- beset_glm(Fertility ~ ., data = swiss, nest_cv = TRUE)
#' summary(subset3)
#' @rdname beset_glm
#' @export
beset_glm <- function(
  form, data, family = "gaussian", link = NULL, p_max = 10, force_in = NULL,
  nest_cv = FALSE, n_folds = 10, n_reps = 10, seed = 42, contrasts = NULL,
  offset = NULL, weights = NULL, start = NULL, etastart = NULL, mustart = NULL,
  epsilon = 1e-8, maxit = 25, skinny = FALSE, n_cores = NULL,
  parallel_type = NULL, cl = NULL
){
  #==================================================================
  # Check family argument and identify appropriate model fit function
  #------------------------------------------------------------------
  # "negbin" requires the package's dedicated fitter (`glm_nb`); every
  # other family is handled by the standard IWLS routine stats::glm.fit.
  family <- check_family(family)
  fitter <- if(family == "negbin") "glm_nb" else "glm.fit"
  #==================================================================
  # Check for missing values and linear dependence
  #------------------------------------------------------------------
  if(inherits(data, "data.frame")){
    # Listwise deletion: model.frame() drops incomplete rows; warn with
    # the number dropped so the user is not surprised by a smaller n.
    data <- model.frame(form, data = data)
    n_omit <- length(attr(data, "na.action"))
    if(n_omit > 0){
      warning(paste("Dropping", n_omit, "rows with missing data."),
              immediate. = TRUE)
      attr(data, "na.action") <- NULL
    }
    # Drop linearly dependent predictors (package helper), then rebuild
    # the model frame and factor-level record on the cleaned data.
    data <- check_lindep(data)
    mf <- model.frame(form, data = data)
    terms <- terms(mf)
    xlevels <- .getXlevels(terms, mf)
    # Set number of cross-validation folds and reps
    n_obs <- nrow(mf)
    cv_params <- set_cv_par(n_obs = n_obs, n_folds = n_folds, n_reps = n_reps,
                            nest_cv = nest_cv)
    n_reps <- cv_params$n_reps; n_folds <- cv_params$n_folds
    # Check that there are sufficient number of observations for p_max.
    # Each CV fit trains on (k - 1)/k of the data; nested CV shrinks the
    # training set by that factor a second time for the inner loop.
    train_size <- n_obs * (n_folds - 1) / n_folds
    if(nest_cv) train_size <- train_size * (n_folds - 1) / n_folds
    p_max <- check_pmax(p_max, train_size, length(attr(terms, "term.labels")))
  } else if(inherits(data, "data_partition")){
    # Pre-split train/test data: all fitting uses the training partition.
    mf <- model.frame(form, data = data$train)
    terms <- terms(mf)
    xlevels <- .getXlevels(terms, mf)
    p_max <- check_pmax(p_max, nrow(mf), length(attr(terms, "term.labels")))
  } else {
    stop("`data` argument must inherit class 'data.frame' or 'data_partition'")
  }
  # Subset selection is meaningless with a single predictor. The stop()
  # is wrapped in tryCatch so the handler can strip the call from the
  # condition, yielding a clean advisory message instead of a traceback.
  tryCatch(
    if(length(attr(terms, "term.labels")) <= 1L){
      stop(
        paste(
          "Your model only has 1 predictor, so no subsets to compare.\n",
          "To cross-validate, just use `lm` or `glm` followed by `validate`."
        )
      )
    },
    error = function(c){
      c$call <- NULL
      stop(c)
    }
  )
  #======================================================================
  # Set up parallel operations
  #----------------------------------------------------------------------
  if(!is.null(cl)){
    # User-supplied cluster: validate it and size the worker pool to match.
    if(!inherits(cl, "cluster")) stop("Not a valid parallel socket cluster")
    n_cores <- length(cl)
  } else if(is.null(n_cores) || n_cores > 1){
    # Nested CV defaults to a socket cluster here; presumably to avoid
    # forked workers forking again in the inner loop -- confirm against
    # setup_parallel().
    if(nest_cv && is.null(parallel_type)) parallel_type <- "sock"
    parallel_control <- setup_parallel(
      parallel_type = parallel_type, n_cores = n_cores, cl = cl)
    n_cores <- parallel_control$n_cores
    cl <- parallel_control$cl
  }
  #======================================================================
  # Recursive function for performing nested cross-validation
  #----------------------------------------------------------------------
  if(nest_cv){
    # Nested CV cannot be combined with a pre-split data_partition: it
    # would be ambiguous which test-error estimate the user wants.
    if(inherits(data, "data_partition")){
      tryCatch(
        stop(paste("Ambiguous call: do you want to use the test partition found",
                   "in `data`` to estimate test performance, or do you want to",
                   "use nested cross-validation? Either set `nest_cv` to FALSE",
                   "or pass a single data frame instead of a train/test split.",
                   sep = "\n")),
        error = function(c){
          c$call <- NULL
          stop(c)
        })
    }
    # Attach offset/weights as columns of `data` so they are subset
    # row-wise together with the data when outer folds are carved out.
    o <- NULL; w <- NULL
    if(!is.null(offset)){
      data$offset = offset
      o <- "offset"
    }
    if(!is.null(weights)){
      data$weights = weights
      w <- "weights"
    }
    # First variable in the formula is the outcome; the rest are
    # predictors (a lone "." means use every remaining column).
    y <- all.vars(form)[1]
    x <- all.vars(form)[-1]
    if(length(x) == 1 && x == ".") x <- NULL
    # Outer loop: one train/test partition per fold x repetition,
    # stratified on the outcome via create_folds().
    fold_ids <- create_folds(data[[y]], n_folds, n_reps, seed)
    all_partitions <- lapply(fold_ids, function(i){
      suppressWarnings(
        data_partition(train = data[-i,], test = data[i,], y = y, x = x,
                       offset = o, weights = w)
      )
    })
    # Inner loop: recurse into beset_glm with nest_cv = FALSE on each
    # partition. Workers run with n_cores = 1 so parallelism is never
    # nested inside the outer parallel map.
    nested_cv <- if(n_cores > 1L){
      if(is.null(cl)){
        parallel::mclapply(all_partitions, function(x){
          beset_glm(form = form, data = x, family = family, link = link,
                    p_max = p_max, force_in = force_in, contrasts = contrasts,
                    nest_cv = FALSE, n_folds = n_folds, n_reps = 1, seed = seed,
                    start = start, etastart = etastart, mustart = mustart,
                    epsilon = epsilon, maxit = maxit, n_cores = 1)
        })
      } else {
        parallel::parLapply(cl, all_partitions, function(x){
          beset_glm(form = form, data = x, family = family, link = link,
                    p_max = p_max, force_in = force_in, contrasts = contrasts,
                    nest_cv = FALSE, n_folds = n_folds, n_reps = 1, seed = seed,
                    start = start, etastart = etastart, mustart = mustart,
                    epsilon = epsilon, maxit = maxit, n_cores = 1)
        })
      }
    } else {
      lapply(all_partitions, function(x){
        beset_glm(form = form, data = x, family = family, link = link,
                  p_max = p_max, force_in = force_in, contrasts = contrasts,
                  nest_cv = FALSE, n_folds = n_folds, n_reps = 1, seed = seed,
                  start = start, etastart = etastart, mustart = mustart,
                  epsilon = epsilon, maxit = maxit, n_cores = 1)
      })
    }
    # "Skinny" objects omit the bulky components (terms, data, contrasts,
    # xlevels) that are only needed for later refitting/prediction.
    if(skinny){
      terms <- data <- contrasts <- xlevels <- NULL
    }
    out <- structure(
      list(
        beset = nested_cv, fold_assignments = fold_ids,
        n_folds = n_folds, n_reps = n_reps,
        family = family, link = link, terms = terms, data = data,
        offset = offset, contrasts = contrasts, xlevels = xlevels
      ),
      class = c("nested", "beset", "glm")
    )
    # Shut down any cluster we are holding before the early return.
    if(!is.null(cl)) stopCluster(cl)
    return(out)
  }
  #==================================================================
  # Create list of arguments for model
  #------------------------------------------------------------------
  # make_args() assembles the x/y matrices, family object, control
  # settings, etc. into the list shape the fitter expects.
  m <- make_args(form = form, data = data, family = family, link = link,
                 contrasts = contrasts, weights = weights, offset = offset,
                 start = start, etastart = etastart, mustart = mustart,
                 epsilon = epsilon, maxit = maxit, force_in = force_in)
  #==================================================================
  #======================================================================
  # Get all subsets (see utils_subset.R)
  #----------------------------------------------------------------------
  all_subsets <- get_subsets(m, p_max)
  #======================================================================
  # Obtain fit statistics for every model
  #----------------------------------------------------------------------
  train_test_stats <- if (n_cores > 1L) {
    if(is.null(cl)){
      parallel::mclapply(all_subsets$pred_idx, get_subset_stats, m = m,
                         fitter= fitter, mc.cores = n_cores)
    } else {
      parallel::parLapply(cl, all_subsets$pred_idx, get_subset_stats, m = m,
                          fitter = fitter)
    }
  } else lapply(all_subsets$pred_idx, get_subset_stats, m = m, fitter = fitter)
  # Reshape the per-model list of stats into one tibble; magnitudes
  # below 1e-6 are snapped to 0 to suppress floating-point noise.
  fit_stats <- transpose(train_test_stats)$fit_stats %>%
    transpose %>% simplify_all %>% as_tibble %>%
    mutate_all(function(x) case_when(abs(x) < 1e-6 ~ 0, TRUE ~ x))
  fit_stats <- dplyr::bind_cols(all_subsets, dplyr::as_tibble(fit_stats))
  # Test-set metrics/predictions exist only when `data` was a
  # data_partition carrying a test component.
  test_stats <- NULL
  test_preds <- NULL
  if(!is.null(train_test_stats[[1]]$test_stats)){
    test_stats <- transpose(train_test_stats)$test_stats %>%
      transpose %>% simplify_all %>% as_tibble %>%
      mutate_all(function(x) case_when(abs(x) < 1e-6 ~ 0, TRUE ~ x))
    test_stats <- bind_cols(all_subsets, as_tibble(test_stats))
    test_preds <- transpose(train_test_stats)$test_preds
  }
  m$test$yhat <- test_preds
  #======================================================================
  # Obtain model with best fit for each number of parameters
  #----------------------------------------------------------------------
  # For each model size keep the subset with the highest R-squared.
  # NOTE(review): the column is named `rsq` here but the documented
  # return value calls it `r2` -- confirm which name get_subset_stats()
  # actually produces.
  cv_stats <- group_by(fit_stats, n_pred) %>%
    filter(rsq == max(rsq)) %>%
    ungroup() %>%
    select(pred_idx:n_pred) %>%
    arrange(n_pred)
  # Refit each winning subset on the full training data so the fitted
  # objects can be cross-validated below; classes are set so downstream
  # glm/lm (and negbin) methods dispatch correctly.
  best_fits <- lapply(cv_stats$pred_idx, function(j){
    model <- c(list(x = m$train$x[, j, drop = FALSE]), m$train[-1])
    out <- do.call(fitter, model)
    out <- c(out, model[c("x", "offset", "control", "intercept")])
    class(out) <- c("glm", "lm")
    if(family == "negbin") class(out) <- c("negbin", class(out))
    out
  })
  #======================================================================
  # Perform cross-validation on best models
  #----------------------------------------------------------------------
  cv_results <- if (n_cores > 1L) {
    if(is.null(cl)){
      parallel::mclapply(best_fits, validate, n_folds = n_folds,
                         n_reps = n_reps, seed = seed, mc.cores = n_cores,
                         silent = TRUE)
    } else {
      parallel::parLapply(cl, best_fits, validate, n_folds = n_folds,
                          n_reps = n_reps, seed = seed, silent = TRUE)
    }
  } else map(best_fits, ~ validate(., n_folds = n_folds, n_reps = n_reps,
                                   seed = seed, silent = TRUE)
  )
  cv_results <- cv_results %>% map("stats") %>% transpose %>% as_tibble
  cv_stats <- bind_cols(cv_stats, cv_results)
  #======================================================================
  # Construct beset_glm object
  #----------------------------------------------------------------------
  stats <- list(fit = fit_stats, cv = cv_stats, test = test_stats)
  parameters <- m$train
  if(skinny){
    terms <- data <- contrasts <- xlevels <- fold_ids <- NULL
  } else {
    # Regenerate fold assignments with the same seed used by validate();
    # presumably these match the folds used above -- confirm that
    # create_folds() and validate() share the assignment logic.
    fold_ids <- create_folds(m$train$y, n_folds, n_reps, seed)
  }
  if(!is.null(cl)) stopCluster(cl)
  structure(
    list(stats = stats,
         parameters = parameters,
         fold_assignments = fold_ids, n_folds = n_folds, n_reps = n_reps,
         family = family, link = link, terms = terms, data = data,
         offset = offset, contrasts = contrasts, xlevels = xlevels),
    class = c("beset", "glm")
  )
}
#' @export
#' @rdname beset_glm
beset_lm <- function(form, data, p_max = 10, force_in = NULL,
                     weights = NULL, contrasts = NULL, offset = NULL,
                     nest_cv = FALSE, n_folds = 10, n_reps = 10, seed = 42,
                     n_cores = NULL, parallel_type = NULL, cl = NULL){
  # Thin wrapper around beset_glm() with its Gaussian/identity defaults.
  # The previous `do.call(beset_glm, as.list(match.call())[-1])` idiom
  # re-evaluated the user's argument expressions in this function's
  # frame, which breaks when an argument references an object visible
  # only in the caller's environment. Instead, retarget the captured
  # call at beset_glm and evaluate it in the caller's frame, preserving
  # the original evaluation semantics of each argument.
  call_expr <- match.call()
  call_expr[[1L]] <- quote(beset_glm)
  eval(call_expr, parent.frame())
}