```r
knitr::opts_chunk$set(
  collapse = TRUE,
  comment  = "#>"
)
```
This vignette walks through the code used to produce the examples of statistical regularization shown in the paper. We use the `butterfly` data from Roland and Matter (2016, *Ecological Monographs*). As mentioned in the paper, we focus on ridge regression, the least absolute shrinkage and selection operator (LASSO), and the elastic net, but many other regularization methods exist, and interested readers should peruse the machine learning literature. Outside of R, there is a comprehensive Python library called scikit-learn with many model selection routines.
In what follows we use the `glmnet` package (available on CRAN) and show one example of the LASSO for mixed-effects models using the `glmmLasso` package (also on CRAN).
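For reference, all three methods estimate coefficients by minimizing a penalized least-squares objective. In `glmnet`'s parameterization, the elastic-net objective for a Gaussian response is

$$
\min_{\beta_0,\,\beta} \; \frac{1}{2n} \sum_{i=1}^{n} \left( y_i - \beta_0 - x_i^\top \beta \right)^2
+ \lambda \left[ \frac{(1 - \alpha)}{2} \lVert \beta \rVert_2^2 + \alpha \lVert \beta \rVert_1 \right],
$$

where $\alpha = 0$ gives ridge regression, $\alpha = 1$ gives the LASSO, and intermediate values give the elastic net. The penalty strength $\lambda \ge 0$ is the tuning parameter we choose by cross-validation below.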
```r
# Set seed and load packages ---------------------------------------------------
set.seed(12300)

library(modselr)
library(tidyverse)
library(glmnet) # package for statistical regularization

# Subset out fitting data ------------------------------------------------------
# Get one subpopulation
sub_pop <- butterfly %>%
  filter(meada == 'L', year < 2014)

# Subset out just the important variables from Roland & Matter (2016) Figure 3
imp_vars <- c("novextmax", "julraintmn1", "novextmin", "novmean",
              "decextmax", "novmeanmax", "logNt", "novmeanmin")

# Prepare data for glmnet ------------------------------------------------------
y <- sub_pop$Rt                                   # population growth rate
X <- as.matrix(select(sub_pop, all_of(imp_vars))) # covariate matrix

# Make sure there are no NAs in the covariate matrix
test_vec <- as.numeric(apply(X, MARGIN = 2, FUN = "mean"))
nacols   <- which(is.na(test_vec))
if (length(nacols) > 0) X <- X[, -nacols]

# Standardize the covariate values
X <- scale(X, center = TRUE, scale = TRUE)

# Run ridge regression ---------------------------------------------------------
pen_facts <- rep(1, ncol(X))            # penalize all covariates
lambdas   <- 10^seq(2, -2, by = -.005)  # sequence of penalties to test

ridge_out <- cv.glmnet(x = X,
                       y = y,
                       lambda = lambdas,
                       penalty.factor = pen_facts,
                       family = "gaussian",
                       alpha = 0,            # 0 for ridge, 1 for lasso, 0.5 for elastic net
                       standardize = FALSE,
                       type.measure = "mse", # mean square error
                       nfolds = 6)           # for cross-validation

# Collect results into data frames ---------------------------------------------
lambdas   <- ridge_out$lambda
cv_scores <- ridge_out$cvm

all_coefs <- as.data.frame(as.matrix(t(ridge_out$glmnet.fit$beta)[, 1:ncol(X)]))
colnames(all_coefs) <- colnames(X)
all_coefs <- all_coefs %>%
  mutate(lambda = log(lambdas)) %>%
  gather(covariate, value, -lambda)

mse_df <- data.frame(lambda = log(lambdas),
                     score  = cv_scores)

best_lambda <- min(mse_df$lambda[which(mse_df$score == min(mse_df$score))])

# Plot the regularization paths ------------------------------------------------
# These are 'modselr' functions for plotting
make_coef_plot(coef_df = all_coefs,
               best_lambda = best_lambda,
               style = "base") +
  ggtitle("Ridge regression: coefficient paths")

make_cvscore_plot(cvscore_df = mse_df,
                  score_name = "MSE",
                  best_lambda = best_lambda,
                  style = "base") +
  ggtitle("Ridge regression: MSE path")
```
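If you only need the coefficient estimates at the cross-validation optimum rather than the full path, they can be pulled directly from the `cv.glmnet` object. A quick check, not part of the original script:

```r
ridge_out$lambda.min               # penalty that minimizes cross-validated MSE
coef(ridge_out, s = "lambda.min")  # coefficients at that penalty
# "lambda.1se" gives the most penalized model within one standard error
# of the minimum, a common, more conservative choice
coef(ridge_out, s = "lambda.1se")
```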
```r
# Run LASSO regression ----------------------------------------------------------
lasso_out <- cv.glmnet(x = X,
                       y = y,
                       lambda = lambdas,
                       penalty.factor = pen_facts,
                       family = "gaussian",
                       alpha = 1, # 0 for ridge, 1 for lasso, 0.5 for elastic net
                       standardize = FALSE,
                       type.measure = "mse",
                       nfolds = 6)

# Collect results into data frames ---------------------------------------------
lambdas   <- lasso_out$lambda
cv_scores <- lasso_out$cvm

all_coefs <- as.data.frame(as.matrix(t(lasso_out$glmnet.fit$beta)[, 1:ncol(X)]))
colnames(all_coefs) <- colnames(X)
all_coefs <- all_coefs %>%
  mutate(lambda = log(lambdas)) %>%
  gather(covariate, value, -lambda)

mse_df <- data.frame(lambda = log(lambdas),
                     score  = cv_scores)

best_lambda <- min(mse_df$lambda[which(mse_df$score == min(mse_df$score))])

# Plot the regularization paths ------------------------------------------------
# These are 'modselr' functions for plotting
make_coef_plot(coef_df = all_coefs,
               best_lambda = best_lambda,
               style = "base") +
  ggtitle("LASSO: coefficient paths")

make_cvscore_plot(cvscore_df = mse_df,
                  score_name = "MSE",
                  best_lambda = best_lambda,
                  style = "base") +
  ggtitle("LASSO: MSE path")
```
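Unlike ridge regression, the LASSO shrinks some coefficients exactly to zero, so it performs variable selection. A quick way to count how many covariates survive at the CV-selected penalty (a small check added here, not in the original script):

```r
lasso_coefs <- coef(lasso_out, s = "lambda.min") # sparse vector, intercept first
sum(lasso_coefs[-1] != 0)                        # number of covariates retained
```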
```r
# Run Elastic Net regression -----------------------------------------------------
enet_out <- cv.glmnet(x = X,
                      y = y,
                      lambda = lambdas,
                      penalty.factor = pen_facts,
                      family = "gaussian",
                      alpha = 0.5, # 0 for ridge, 1 for lasso, 0.5 for elastic net
                      standardize = FALSE,
                      type.measure = "mse",
                      nfolds = 6)

# Collect results into data frames ---------------------------------------------
lambdas   <- enet_out$lambda
cv_scores <- enet_out$cvm

all_coefs <- as.data.frame(as.matrix(t(enet_out$glmnet.fit$beta)[, 1:ncol(X)]))
colnames(all_coefs) <- colnames(X)
all_coefs <- all_coefs %>%
  mutate(lambda = log(lambdas)) %>%
  gather(covariate, value, -lambda)

mse_df <- data.frame(lambda = log(lambdas),
                     score  = cv_scores)

best_lambda <- min(mse_df$lambda[which(mse_df$score == min(mse_df$score))])

# Plot the regularization paths ------------------------------------------------
# These are 'modselr' functions for plotting
make_coef_plot(coef_df = all_coefs,
               best_lambda = best_lambda,
               style = "base") +
  ggtitle("Elastic Net: coefficient paths")

make_cvscore_plot(cvscore_df = mse_df,
                  score_name = "MSE",
                  best_lambda = best_lambda,
                  style = "base") +
  ggtitle("Elastic Net: MSE path")
```
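With all three fits in hand, one simple summary (added here, not in the original script) is the minimum cross-validated MSE achieved by each method:

```r
# Compare the minimum cross-validated MSE achieved by each method.
# Note: each cv.glmnet call above drew its own random folds; for a strict
# comparison, pass a common `foldid` vector to all three calls.
data.frame(
  method     = c("ridge", "LASSO", "elastic net"),
  min_cv_mse = c(min(ridge_out$cvm), min(lasso_out$cvm), min(enet_out$cvm))
)
```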
As we discussed in the main text, statistical regularization techniques for mixed-effects, or hierarchical, models are still in active development. LASSO methods seem to have developed most rapidly; there is even an R package, `glmmLasso`. We use this package to fit and regularize a mixed-effects model with a random intercept for meadow (`meada` in the `butterfly` data frame).
Note that `glmmLasso` does not include a cross-validation routine for tuning the penalty parameter (`lambda` in the function), so we built our own CV routine for `glmmLasso`, which we use here to create path plots similar to those above and to choose the optimal penalty.
```r
library(glmmLasso)

# Format data and define CV settings --------------------------------------------
lambdas  <- 10^seq(2, -2, by = -.1) # sequence of penalties to test
nlambdas <- length(lambdas)

# add year, growth rate, and meadow to the variable list
all_vars <- c("year", "Rt", "meada", imp_vars)

fitting_tmp <- butterfly %>%
  filter(year < 2014) %>%
  select(all_of(all_vars)) %>%
  mutate(meada = as.factor(meada))

# Stack one copy of the fitting data for each candidate lambda
fitting_df <- purrr::map_df(seq_len(nlambdas), ~fitting_tmp) %>%
  mutate(lambda    = rep(lambdas, each = nrow(fitting_tmp)),
         lambda_in = lambda)

# Define folds of data, by year --------------------------------------------------
all_years   <- unique(fitting_df$year)
nfolds      <- 6
folds       <- cut(seq(1, length(all_years)), breaks = nfolds, labels = FALSE) # get folds
folds_sharp <- sample(folds, length(folds))                                    # shuffle folds
folds_df    <- data.frame(year = all_years, fold_id = folds_sharp)

# Define model function -----------------------------------------------------------
lasso_model <- function(df) {
  L1 <- unique(df$lambda_in)
  glmmLasso(fix = Rt ~ novextmax + julraintmn1 + novextmin + novmean +
              decextmax + novmeanmax + logNt + novmeanmin,
            rnd = list(meada = ~1),
            data = df,
            lambda = L1,
            family = gaussian(link = "identity"),
            switch.NR = FALSE,
            final.re = FALSE,
            control = list())
}

# Fit models within CV -------------------------------------------------------------
mse_df <- NULL
for (ifold in 1:nfolds) {
  # Years held out in this fold
  iyear <- folds_df %>%
    filter(fold_id == ifold) %>%
    pull(year)

  # Training data: one nested data frame per lambda
  fitter <- fitting_df %>%
    filter(!(year %in% iyear)) %>%
    group_by(lambda) %>%
    nest()

  models <- fitter %>%
    mutate(model = data %>% map(lasso_model))

  # Extract the coefficient estimates for each lambda
  model_dets <- data.frame(matrix(unlist(map(models$model, coef)),
                                  nrow = nlambdas, byrow = TRUE))
  colnames(model_dets)    <- names(map(models$model, coef)[[1]])
  colnames(model_dets)[1] <- "intercept"
  model_dets <- model_dets %>%
    mutate(lambda = lambdas)

  # Test data: predict Rt for each lambda and compute the out-of-sample MSE
  tester <- fitting_df %>%
    filter(year %in% iyear) %>%
    select(-lambda_in) %>%
    left_join(model_dets, by = c("lambda")) %>%
    mutate(Rt_hat = intercept +
             novextmax.x * novextmax.y +
             julraintmn1.x * julraintmn1.y +
             novextmin.x * novextmin.y +
             novmean.x * novmean.y +
             decextmax.x * decextmax.y +
             novmeanmax.x * novmeanmax.y +
             logNt.x * logNt.y +
             novmeanmin.x * novmeanmin.y,
           square_error = (Rt - Rt_hat)^2) %>%
    group_by(lambda) %>%
    summarise(mse = mean(square_error)) %>%
    mutate(fold_id = ifold)

  mse_df <- rbind(mse_df, tester)
}

# Plot cross-validation MSE as a function of the penalty
mse_df %>%
  group_by(lambda) %>%
  summarise(avg_mse = mean(mse)) %>%
  ggplot(aes(x = log(lambda), y = avg_mse)) +
  geom_line() +
  xlab(expression(log(lambda))) +
  ylab("Cross-validation MSE")

# Find lambda at minimum MSE and refit -------------------------------------------
opt_lambda <- mse_df %>%
  group_by(lambda) %>%
  summarise(avg_mse = mean(mse)) %>%
  filter(avg_mse == min(avg_mse)) %>%
  pull(lambda)

# Refit on a single copy of the data (fitting_df stacks one copy per lambda,
# which would effectively weaken the penalty)
opt_fit <- glmmLasso(fix = Rt ~ novextmax + julraintmn1 + novextmin + novmean +
                       decextmax + novmeanmax + logNt + novmeanmin,
                     rnd = list(meada = ~1),
                     data = fitting_tmp,
                     lambda = opt_lambda,
                     family = gaussian(link = "identity"),
                     switch.NR = FALSE,
                     final.re = FALSE,
                     control = list())

coef_df <- data.frame(Covariate = names(opt_fit$coefficients),
                      Estimate  = as.numeric(opt_fit$coefficients))

knitr::kable(coef_df, digits = 2,
             caption = "Coefficient estimates from `glmmLasso` with the optimal penalty from cross-validation.")
```
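The loop above tracks only out-of-sample MSE, so it yields the MSE path but not coefficient paths. To draw coefficient paths for `glmmLasso` analogous to the `glmnet` plots, one option is to refit on the full fitting data at each penalty and collect the fixed-effect estimates. A minimal sketch, assuming the `lambdas`, `fitting_tmp`, and `opt_lambda` objects created above (fits at extreme penalties may be slow or fail to converge):

```r
# Refit glmmLasso at each candidate penalty on one copy of the fitting data
# and collect the fixed-effect estimates (element 1 is the intercept)
path_df <- purrr::map_df(lambdas, function(l) {
  fit <- glmmLasso(fix = Rt ~ novextmax + julraintmn1 + novextmin + novmean +
                     decextmax + novmeanmax + logNt + novmeanmin,
                   rnd = list(meada = ~1),
                   data = fitting_tmp,
                   lambda = l,
                   family = gaussian(link = "identity"))
  coefs <- fit$coefficients[-1]  # drop the intercept
  data.frame(lambda = l, covariate = names(coefs), value = as.numeric(coefs))
})

# Coefficient paths, with the CV-selected penalty marked
path_df %>%
  ggplot(aes(x = log(lambda), y = value, colour = covariate)) +
  geom_line() +
  geom_vline(xintercept = log(opt_lambda), linetype = "dashed") +
  xlab(expression(log(lambda))) +
  ylab("Coefficient estimate")
```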