library(learnr)
library(gradethis)
library(knitr)

tutorial_options(exercise.timelimit = 5, exercise.checker = gradethis::grade_learnr)

knitr::opts_chunk$set(echo = FALSE)

Lasso Regression (small p)

Generating the Data

Generate data from a linear model without an intercept. Let $X_{ij}$ and $\epsilon_i$ be iid standard Gaussian. Set $p=30$ and $n=100$. Set the first 5 coefficients $\beta_j = 1$ ($j=1,\ldots,5$); the rest are zero.
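
In symbols, the data-generating model is

$$
Y_i = \sum_{j=1}^{p} X_{ij}\,\beta_j + \epsilon_i, \qquad i = 1,\ldots,n, \qquad \beta = (\underbrace{1,\ldots,1}_{5},\,0,\ldots,0)^\top.
$$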

library(glmnet)
set.seed(20181023)
n=___
p=___
X = matrix(rnorm(n*p),n)
beta = _______________
Y = ________________
Y
library(glmnet)
set.seed(20181023)
n=100
p=30
X = matrix(rnorm(n*p),n)        # n x p matrix of iid standard normals
beta = c(rep(1,5),rep(0,p-5))   # first 5 coefficients are 1, the rest are 0
Y = c(X %*% beta + rnorm(n))    # response from the linear model (no intercept)

grade_result(
  pass_if(~ identical(.result, Y), "Correct!"),
  fail_if(~ TRUE, "Incorrect.")
)

Analysis

Fit the lasso to this data set and compute CV, AIC, and BIC along the solution path. One can show that an unbiased estimator of the lasso degrees of freedom is the number of selected covariates. Use the "unknown variance" version of AIC and BIC, in which the log-likelihood term is replaced by $\log(RSS)$.
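
Concretely, with $\widehat{df}_\lambda$ the number of nonzero coefficients at a given $\lambda$ (the extra $+1$ accounts for the estimated noise variance), the criteria as implemented in the functions below are, up to additive constants,

$$
\mathrm{AIC}(\lambda) = \log\!\left(\frac{RSS_\lambda}{n}\right) + \frac{2(\widehat{df}_\lambda + 1)}{n},
\qquad
\mathrm{BIC}(\lambda) = \log\!\left(\frac{RSS_\lambda}{n}\right) + \frac{2(\widehat{df}_\lambda + 1)\log n}{n}.
$$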

library(glmnet)
set.seed(20181023)
n=100
p=30
X = matrix(rnorm(n*p),n)
beta = c(rep(1,5),rep(0,p-5))
Y = c(X %*% beta + rnorm(n))
aic <- function(cvglmnetobj, Y,  X){
  like = colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0 )
  result = _______________________________
  return(result)
}

bic <- function(cvglmnetobj, Y,  X){
  n = length(Y)
  like = colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0 )
  result = ________________________________
  return(result)
}

If done correctly, your functions should look like:

aic <- function(cvglmnetobj, Y,  X){
  like = rss(cvglmnetobj, Y,  X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0 )  # number of selected covariates at each lambda
  log(like) + 2*(df+1)/length(Y)
}

bic <- function(cvglmnetobj, Y,  X){
  n = length(Y)
  like = rss(cvglmnetobj, Y,  X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0 )
  log(like) + 2*(df+1)*log(n)/n
}

These functions, together with the `rss()` helper they call, have been preloaded for you to use in the rest of the exercises.

Compute AIC, BIC, and CV

Run the following code, which finds the best model under each of AIC, BIC, and CV.

library(glmnet)
set.seed(20181023)
n=100
p=30
X = matrix(rnorm(n*p),n)
beta = c(rep(1,5),rep(0,p-5))
Y = c(X %*% beta + rnorm(n))
rss <- function(cvglmnetobj, Y, X){
  colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2)
}

aic <- function(cvglmnetobj, Y,  X){
  like = rss(cvglmnetobj, Y,  X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0 )
  log(like) + 2*(df+1)/length(Y)
}

bic <- function(cvglmnetobj, Y,  X){
  n = length(Y)
  like = rss(cvglmnetobj, Y,  X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0 )
  log(like) + 2*(df+1)*log(n)/n
}
out = cv.glmnet(X,Y, intercept=FALSE)
bestmodels = list(
  AIC = out$glmnet.fit$beta[,which.min(aic(out, Y, X))],  # coefficients at the AIC-minimizing lambda
  BIC = out$glmnet.fit$beta[,which.min(bic(out, Y, X))],  # coefficients at the BIC-minimizing lambda
  CV = c(coef(out, s='lambda.min')[-1,])                  # CV choice; drop the intercept row
)
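
As a quick, optional sanity check (not part of the original exercise), you can count how many coefficients each criterion keeps:

sapply(bestmodels, function(b) sum(abs(b) > 0))  # number of selected covariates per criterion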

For each model selection criterion, how many of the truly nonzero coefficients do you select (true positives)? How many zero coefficients do you incorrectly select (false positives)? Fill in the missing code.
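
Counting against the true $\beta$ from the simulation, the two quantities are

$$
\mathrm{TP} = \#\{j : \hat\beta_j \neq 0 \text{ and } \beta_j \neq 0\},
\qquad
\mathrm{FP} = \#\{j : \hat\beta_j \neq 0 \text{ and } \beta_j = 0\}.
$$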

library(glmnet)
set.seed(20181023)
n=100
p=30
X = matrix(rnorm(n*p),n)
beta = c(rep(1,5),rep(0,p-5))
Y = c(X %*% beta + rnorm(n))
rss <- function(cvglmnetobj, Y, X){
  colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2)
}

aic <- function(cvglmnetobj, Y,  X){
  like = rss(cvglmnetobj, Y,  X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0 )
  log(like) + 2*(df+1)/length(Y)
}

bic <- function(cvglmnetobj, Y,  X){
  n = length(Y)
  like = rss(cvglmnetobj, Y,  X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0 )
  log(like) + 2*(df+1)*log(n)/n
}
out = cv.glmnet(X,Y, intercept=FALSE)
bestmodels = list(
  AIC = out$glmnet.fit$beta[,which.min(aic(out, Y, X))],
  BIC = out$glmnet.fit$beta[,which.min(bic(out, Y, X))],
  CV = c(coef(out, s='lambda.min')[-1,])
)
errs <- function(b,beta){
  TP = _________________ # True Positives
  FP = _________________ # False Positives
  c(TP,FP)
}

Your function should look similar to the following:

errs <- function(b,beta){
  TP = sum((abs(b) > 0 & abs(beta)>0)) # True Positives
  FP = sum((abs(b)>0 & beta==0)) # False Positives
  c(TP,FP)
}
library(glmnet)
set.seed(20181023)
n=100
p=30
X = matrix(rnorm(n*p),n)
beta = c(rep(1,5),rep(0,p-5))
Y = c(X %*% beta + rnorm(n))
rss <- function(cvglmnetobj, Y, X){
  colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2)
}

aic <- function(cvglmnetobj, Y,  X){
  like = rss(cvglmnetobj, Y,  X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0 )
  log(like) + 2*(df+1)/length(Y)
}

bic <- function(cvglmnetobj, Y,  X){
  n = length(Y)
  like = rss(cvglmnetobj, Y,  X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0 )
  log(like) + 2*(df+1)*log(n)/n
}
out = cv.glmnet(X,Y, intercept=FALSE)
bestmodels = list(
  AIC = out$glmnet.fit$beta[,which.min(aic(out, Y, X))],
  BIC = out$glmnet.fit$beta[,which.min(bic(out, Y, X))],
  CV = c(coef(out, s='lambda.min')[-1,])
)
errs <- function(b,beta){
  TP = sum((abs(b) > 0 & abs(beta)>0)) # True Positives
  FP = sum((abs(b)>0 & beta==0)) # False Positives
  c(TP,FP)
}

The following code will create a nice table showing the accuracy of each of the methods.

tab = sapply(bestmodels,errs,beta=beta)
rownames(tab) = c('true positives','false positives')
kable(tab)

quiz(caption="Questions",
  question("Which method had the lowest number of false positives?",
    answer("AIC"),
    answer("BIC", correct=TRUE),
    answer("CV"),
    answer("All the same"),
    allow_retry = TRUE,
    correct = paste0("Correct!")
  ),
  question("Which method had the highest number of true positives?",
    answer("AIC"),
    answer("BIC"),
    answer("CV"),
    answer("All the same.",correct = TRUE),
    allow_retry = TRUE,
    correct = paste0("Correct!")
  )
)

Lasso Regression (large p)

Repeat the same exercise but using $p=300$. The following functions have been saved and preloaded for your convenience:

rss <- function(cvglmnetobj, Y, X){
  colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2)
}

aic <- function(cvglmnetobj, Y,  X){
  like = rss(cvglmnetobj, Y,  X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0 )
  log(like) + 2*(df+1)/length(Y)
}

bic <- function(cvglmnetobj, Y,  X){
  n = length(Y)
  like = rss(cvglmnetobj, Y,  X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0 )
  log(like) + 2*(df+1)*log(n)/n
}
errs <- function(b,beta){
  TP = sum((abs(b) > 0 & abs(beta)>0))
  FP = sum((abs(b)>0 & beta==0))
  c(TP,FP)
}

library(glmnet)
set.seed(20181023)
n=100
p=300
X = matrix(rnorm(n*p),n)
beta = c(rep(1,5),rep(0,p-5))
Y = c(X %*% beta + rnorm(n))
Y
library(glmnet)
set.seed(20181023)
n=100
p=_____
X = matrix(rnorm(n*p),n)
beta = c(rep(1,5),rep(0,p-5))
Y = c(X %*% beta + rnorm(n))

grade_result(
  pass_if(~ identical(.result, Y), "Correct!"),
  fail_if(~ TRUE, "Incorrect.")
)

aic <- function(cvglmnetobj, Y,  X){
  like = colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0 )
  log(like) + 2*(df+1)/length(Y)
}

bic <- function(cvglmnetobj, Y,  X){
  n = length(Y)
  like = colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0 )
  log(like) + 2*(df+1)*log(n)/n
}
errs <- function(b,beta){
  TP = sum((abs(b) > 0 & abs(beta)>0))
  FP = sum((abs(b)>0 & beta==0))
  c(TP,FP)
}
library(glmnet)
set.seed(20181023)
n=100
p=300
X = matrix(rnorm(n*p),n)
beta = c(rep(1,5),rep(0,p-5))
Y = c(X %*% beta + rnorm(n))

Run the following code to produce a table of values that shows the accuracy of the AIC, BIC, and CV methods.

out = cv.glmnet(X,Y, intercept=FALSE)
bestmodels = list(
  AIC = out$glmnet.fit$beta[,which.min(aic(out, Y, X))],
  BIC = out$glmnet.fit$beta[,which.min(bic(out, Y, X))],
  CV = c(coef(out, s='lambda.min')[-1,])
)
tab = sapply(bestmodels,errs,beta=beta)
rownames(tab) = c('true positives','false positives')
kable(tab)

quiz(caption="Questions",
  question("Which method had the lowest number of false positives?",
    answer("AIC"),
    answer("BIC", correct=TRUE),
    answer("CV"),
    answer("All the same"),
    allow_retry = TRUE,
    correct = paste0("Correct!")
  ),
  question("Which method had the highest number of true positives?",
    answer("AIC"),
    answer("BIC"),
    answer("CV"),
    answer("All the same.",correct = TRUE),
    allow_retry = TRUE,
    correct = paste0("Correct!")
  )
)

What do you notice?

Did you notice any difference between the settings with a small and a large number of predictors?

There's not a huge difference in low dimensions: AIC and CV are essentially the same, picking slightly larger models than BIC. With $p=300$, AIC drastically over-selects, essentially choosing the largest model available on the path. Because $p > n$, the lasso can drive the training $RSS$ nearly to zero at the small-$\lambda$ end of the path, so $\log(RSS)$ keeps decreasing faster than the small penalty $2(\widehat{df}+1)/n$ grows. BIC does quite well, and CV slightly over-selects. In all cases the true coefficients are always included, since the signal-to-noise ratio is fairly large.
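
If you want to see this directly, here is a small optional sketch (assuming `out`, `Y`, and `X` from the $p=300$ exercise are still in your workspace) that plots both criteria against the number of selected covariates along the path:

df = colSums(abs(out$glmnet.fit$beta) > 0)   # model size at each lambda on the path
plot(df, aic(out, Y, X), xlab = "number of selected covariates", ylab = "criterion value")
points(df, bic(out, Y, X), pch = 2)
legend("topright", legend = c("AIC", "BIC"), pch = c(1, 2))

This makes the over-selection visible: AIC is minimized at the largest models on the path, while BIC turns back up much earlier.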


