knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

This vignette walks through the code used to produce the examples of statistical regularization shown in the paper. We use the butterfly data from Roland and Matter 2016 (Ecological Monographs, data here). As mentioned in the paper, we focus on ridge regression, the least absolute shrinkage and selection operator (LASSO), and the elastic net, but many other regularization methods exist, and interested readers should peruse the machine learning literature. Outside of R, the comprehensive Python library scikit-learn offers many model selection routines. In what follows we use the glmnet package (webpage, CRAN) and show one example of the LASSO for mixed-effects models using the glmmLasso package (CRAN).
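
For reference, all three methods solve the same penalized least-squares problem; in the parameterization used by glmnet (Gaussian response), the objective is

$$
\min_{\beta_0,\, \beta} \; \frac{1}{2N}\sum_{i=1}^{N}\left(y_i - \beta_0 - x_i^{\top}\beta\right)^2 + \lambda\left[\frac{1-\alpha}{2}\lVert\beta\rVert_2^2 + \alpha\lVert\beta\rVert_1\right],
$$

where `alpha = 0` gives ridge regression, `alpha = 1` gives the LASSO, and intermediate values (0.5 below) give the elastic net. The penalty `lambda` is chosen by cross-validation in each example.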

Ridge regression

# Set seed and load packages ---------------------------------------------------
set.seed(12300)
library(modselr)
library(tidyverse)
library(glmnet) # package for statistical regularization


# Subset out fitting data ------------------------------------------------------
# Get one subpopulation
sub_pop <- butterfly %>%
  filter(meada == 'L', year < 2014)

# Subset out just the important variables from Roland & Matter (2016) Figure 3
imp_vars <- c("novextmax",
              "julraintmn1",
              "novextmin",
              "novmean",
              "decextmax",
              "novmeanmax",
              "logNt",
              "novmeanmin")


# Prepare data for glmnet ------------------------------------------------------
y <- sub_pop$Rt # population growth rate
X <- as.matrix(select(sub_pop, all_of(imp_vars))) # covariate matrix

# Make sure there are no NAs in the covariate matrix
test_vec <- as.numeric(apply(X, MARGIN = 2, FUN = "mean"))
nacols <- which(is.na(test_vec))
if(length(nacols) > 0) X <- X[,-nacols]

# Standardize the covariate values (so we can set standardize = FALSE in glmnet below)
X <- scale(X, center = TRUE, scale = TRUE)


# Run ridge regression ---------------------------------------------------------
pen_facts <- rep(1, ncol(X)) # penalize all covariates
lambdas <- 10^seq(2, -2, by = -.005) # sequence of penalties to test

ridge_out <- cv.glmnet(x = X, 
                       y = y, 
                       lambda = lambdas,
                       penalty.factor = pen_facts,
                       family = "gaussian", 
                       alpha = 0, # 0 for ridge, 1 for lasso, 0.5 for elastic net 
                       standardize = FALSE, # covariates already scaled above
                       type.measure = "mse", # mean square error
                       nfolds = 6) # for cross-validation


# Collect results into data frames ---------------------------------------------
lambdas <- ridge_out$lambda
cv_scores <- ridge_out$cvm
all_coefs <- as.data.frame(as.matrix(t(ridge_out$glmnet.fit$beta)[,1:ncol(X)]))
colnames(all_coefs) <- colnames(X)
all_coefs <- all_coefs %>%
  mutate(lambda = log(lambdas)) %>%
  gather(covariate, value, -lambda)

mse_df <- data.frame(lambda = log(lambdas),
                     score = cv_scores)

best_lambda <- min(mse_df$lambda[which(mse_df$score == min(mse_df$score))])
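
Note that `cv.glmnet` also stores the penalty that minimizes the cross-validation error in `lambda.min` (and a more conservative choice, `lambda.1se`), so the manually extracted `best_lambda` can be cross-checked against the built-in value:

# Should match best_lambda (up to ties in the CV error)
log(ridge_out$lambda.min)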


# Plot the regularization paths ------------------------------------------------
# These are 'modselr' functions for plotting
make_coef_plot(coef_df = all_coefs, 
               best_lambda = best_lambda, 
               style = "base") + ggtitle("Ridge regression: coefficient paths")

make_cvscore_plot(cvscore_df = mse_df, 
                  score_name = "MSE", 
                  best_lambda = best_lambda, 
                  style = "base") + ggtitle("Ridge regression: MSE path")

Least Absolute Shrinkage and Selection Operator (LASSO)

# Run LASSO regression ---------------------------------------------------------
lasso_out <- cv.glmnet(x = X, 
                       y = y, 
                       lambda = lambdas,
                       penalty.factor = pen_facts,
                       family = "gaussian", 
                       alpha = 1, # 0 for ridge, 1 for lasso, 0.5 for elastic net 
                       standardize = FALSE, 
                       type.measure = "mse",
                       nfolds = 6)


# Collect results into data frames ---------------------------------------------
lambdas <- lasso_out$lambda
cv_scores <- lasso_out$cvm
all_coefs <- as.data.frame(as.matrix(t(lasso_out$glmnet.fit$beta)[,1:ncol(X)]))
colnames(all_coefs) <- colnames(X)
all_coefs <- all_coefs %>%
  mutate(lambda = log(lambdas)) %>%
  gather(covariate, value, -lambda)

mse_df <- data.frame(lambda = log(lambdas),
                     score = cv_scores)
best_lambda <- min(mse_df$lambda[which(mse_df$score == min(mse_df$score))])

# Plot the regularization paths ------------------------------------------------
# These are 'modselr' functions for plotting
make_coef_plot(coef_df = all_coefs, 
               best_lambda = best_lambda, 
               style = "base") + ggtitle("LASSO: coefficient paths")

make_cvscore_plot(cvscore_df = mse_df, 
                  score_name = "MSE", 
                  best_lambda = best_lambda, 
                  style = "base") + ggtitle("LASSO: MSE path")
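
Because the LASSO shrinks some coefficients exactly to zero, it is worth listing which covariates survive at the selected penalty. One way to do this with `glmnet`'s built-in machinery:

# Coefficients at the CV-selected penalty; dropped covariates appear as zeros ('.')
coef(lasso_out, s = "lambda.min")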

Elastic net

# Run Elastic Net regression ---------------------------------------------------
enet_out <- cv.glmnet(x = X, 
                       y = y, 
                       lambda = lambdas,
                       penalty.factor = pen_facts,
                       family = "gaussian", 
                       alpha = 0.5, # 0.5 for elastic net (between ridge at 0 and lasso at 1) 
                       standardize = FALSE, 
                       type.measure = "mse",
                       nfolds = 6)


# Collect results into data frames ---------------------------------------------
lambdas <- enet_out$lambda
cv_scores <- enet_out$cvm
all_coefs <- as.data.frame(as.matrix(t(enet_out$glmnet.fit$beta)[,1:ncol(X)]))
colnames(all_coefs) <- colnames(X)
all_coefs <- all_coefs %>%
  mutate(lambda = log(lambdas)) %>%
  gather(covariate, value, -lambda)

mse_df <- data.frame(lambda = log(lambdas),
                     score = cv_scores)
best_lambda <- min(mse_df$lambda[which(mse_df$score == min(mse_df$score))])


# Plot the regularization paths ------------------------------------------------
# These are 'modselr' functions for plotting
make_coef_plot(coef_df = all_coefs, 
               best_lambda = best_lambda, 
               style = "base") + ggtitle("Elastic Net: coefficient paths")

make_cvscore_plot(cvscore_df = mse_df, 
                  score_name = "MSE", 
                  best_lambda = best_lambda, 
                  style = "base") + ggtitle("Elastic Net: MSE path")

Mixed-effects LASSO

As we discussed in the main text, statistical regularization techniques for mixed-effects, or hierarchical, models are still in active development. It seems that LASSO methods have developed most rapidly -- there is even an R package, glmmLasso. We use that package to fit and regularize a mixed-effects model with a random intercept for meadow (meada in the butterfly data frame).

Note that glmmLasso does not include a cross-validation routine for tuning the penalty parameter (lambda in the function). So we built our own CV function for glmmLasso, which we use here to create path plots similar to those above and to choose the optimal penalty.

library(glmmLasso)

# Format data and define CV settings -------------------------------------------

lambdas  <- 10^seq(2, -2, by = -.1) # sequence of penalties to test
nlambdas <- length(lambdas)

all_vars <- c("year", "Rt", "meada", imp_vars) # add year, growth rate, and meadow to var list

fitting_tmp <- butterfly %>%
  filter(year < 2014) %>%
  select(all_of(all_vars)) %>%
  mutate(
    meada = as.factor(meada)
    )

fitting_df <- purrr::map_df(seq_len(nlambdas), ~fitting_tmp) %>%
  mutate(
    lambda = rep(lambdas, each = nrow(fitting_tmp)),
    lambda_in = lambda
  ) 

# Define folds of data, by year ------------------------------------------------
all_years <- unique(fitting_df$year)

nfolds <- 6
folds  <- cut(seq(1,length(all_years)),
              breaks = nfolds,
              labels = FALSE) # get folds

folds_sharp <- sample(folds, length(folds)) # shuffle folds
folds_df    <- data.frame(year = all_years,
                          fold_id = folds_sharp)

# Define model function --------------------------------------------------------

lasso_model <- function(df) {
  L1 <- unique(df$lambda_in)
  glmmLasso(fix       = Rt ~ novextmax+
                             julraintmn1+
                             novextmin+
                             novmean+
                             decextmax+
                             novmeanmax+
                             logNt+
                             novmeanmin,
            rnd       = list(meada =~ 1),
            data      = df,
            lambda    = L1,
            family    = gaussian(link="identity"),
            switch.NR = FALSE,
            final.re  = FALSE,
            control   = list())
}

# Fit models within CV ---------------------------------------------------------

mse_df <- NULL # container for per-fold CV results

for(ifold in 1:nfolds) {
  iyear <- folds_df %>% 
    filter(fold_id == ifold) %>% 
    pull(year)

  fitter <- fitting_df %>% 
    filter(!(year %in% iyear)) %>%
    group_by(lambda) %>%
    nest()

  models <- fitter %>%
    mutate(
      model  = data %>% map(lasso_model)
    )

  model_dets <- data.frame(
    matrix(
      unlist(
        map(models$model, coef)
        ), 
      nrow = nlambdas, 
      byrow = TRUE
      )
    )
  colnames(model_dets) <- names(map(models$model, coef)[[1]])
  colnames(model_dets)[1] <- "intercept"

  model_dets <- model_dets %>%
    mutate(
      lambda = lambdas
    )

  tester <- fitting_df %>%
    filter(year %in% iyear) %>%
    select(-lambda_in) %>%
    left_join(model_dets, by = c("lambda")) %>%
    mutate(
      Rt_hat = intercept +
               novextmax.x*novextmax.y +
               julraintmn1.x*julraintmn1.y +
               novextmin.x*novextmin.y +
               novmean.x*novmean.y +
               decextmax.x*decextmax.y +
               novmeanmax.x*novmeanmax.y +
               logNt.x*logNt.y +
               novmeanmin.x*novmeanmin.y,
      square_error = (Rt - Rt_hat)^2
    ) %>%
    group_by(lambda) %>%
    summarise(mse = mean(square_error)) %>%
    mutate(fold_id = ifold)

  mse_df <- rbind(mse_df, tester)
}

mse_df %>%
  group_by(lambda) %>%
  summarise(avg_mse = mean(mse)) %>%
  ggplot(aes(x = log(lambda), y = avg_mse))+
    geom_line()+
    xlab(expression(log(lambda)))+
    ylab("Cross-validation MSE")

# Find lambda at minimum MSE and refit
opt_lambda <- mse_df %>%
  group_by(lambda) %>%
  summarise(avg_mse = mean(mse)) %>%
  filter(avg_mse == min(avg_mse)) %>%
  pull(lambda)

opt_fit <- glmmLasso(fix = Rt ~ novextmax+
                             julraintmn1+
                             novextmin+
                             novmean+
                             decextmax+
                             novmeanmax+
                             logNt+
                             novmeanmin,
            rnd       = list(meada =~ 1),
            data      = fitting_tmp, # un-replicated fitting data (fitting_df repeats rows once per lambda)
            lambda    = opt_lambda,
            family    = gaussian(link="identity"),
            switch.NR = FALSE,
            final.re  = FALSE,
            control   = list())

coef_df <- data.frame(
  Covariate = names(opt_fit$coefficients),
  Estimate = as.numeric(opt_fit$coefficients)
)

knitr::kable(coef_df, 
             digits = 2,
             caption = "Coefficient estimates from `glmmLasso` with the optimal penalty from cross-validation.")
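
The fitted object also contains the estimated random intercepts for each meadow (the `ranef` component described in the `glmmLasso` documentation), which can be inspected directly:

# Estimated random intercept for each meadow
opt_fit$ranef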

