library(learnr)
library(gradethis)
library(knitr)
tutorial_options(exercise.timelimit = 5, exercise.checker = gradethis::grade_learnr)
knitr::opts_chunk$set(echo = FALSE)
Generate data from a linear model without an intercept. Let $X_{ij}$ and $\epsilon_i$ be iid standard Gaussian. Set $p=30$ and $n=100$. Set the first 5 coefficients $\beta_j = 1$ ($j=1,\ldots,5$) and the rest to zero.
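Written out, the model you should simulate from is
$$Y_i = \sum_{j=1}^{p} X_{ij}\,\beta_j + \epsilon_i, \qquad X_{ij} \overset{\mathrm{iid}}{\sim} N(0,1), \quad \epsilon_i \overset{\mathrm{iid}}{\sim} N(0,1), \quad i = 1,\ldots,n.$$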
library(glmnet)
set.seed(20181023)
n = ___
p = ___
X = matrix(rnorm(n*p), n)
beta = _______________
Y = ________________
Y
library(glmnet)
set.seed(20181023)
n = 100
p = 30
X = matrix(rnorm(n*p), n)
beta = c(rep(1,5), rep(0,p-5))
Y = c(X %*% beta + rnorm(n))
grade_result(
  pass_if(~ identical(.result, Y), "Correct!"),
  fail_if(~ TRUE, "Incorrect.")
)
Fit the LASSO to this data set. Calculate CV, AIC, and BIC. One can show that an unbiased estimator of the LASSO's degrees of freedom is the number of selected covariates. Use the "unknown variance" version of the criteria (based on $\log(\mathrm{RSS})$).
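For reference, the unknown-variance forms of these criteria are, up to additive constants,
$$\mathrm{AIC} = \log\!\left(\frac{\mathrm{RSS}}{n}\right) + \frac{2\,\widehat{\mathrm{df}}}{n}, \qquad \mathrm{BIC} = \log\!\left(\frac{\mathrm{RSS}}{n}\right) + \frac{\widehat{\mathrm{df}}\,\log n}{n},$$
where $\widehat{\mathrm{df}}$ is the number of selected covariates. (The code below counts the error variance as one extra parameter, $\widehat{\mathrm{df}}+1$, and its penalty constants may differ slightly from the textbook scaling.)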
library(glmnet)
set.seed(20181023)
n = 100
p = 30
X = matrix(rnorm(n*p), n)
beta = c(rep(1,5), rep(0,p-5))
Y = c(X %*% beta + rnorm(n))
aic <- function(cvglmnetobj, Y, X){
  like = colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2 )  # RSS/n along the lambda path
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0)                  # number of selected covariates
  result = _______________________________
  return(result)
}
bic <- function(cvglmnetobj, Y, X){
  n = length(Y)
  like = colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2 )  # RSS/n along the lambda path
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0)                  # number of selected covariates
  result = ________________________________
  return(result)
}
If done correctly, your functions should look like:
aic <- function(cvglmnetobj, Y, X){
  like = rss(cvglmnetobj, Y, X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0)
  log(like) + 2*(df+1)/length(Y)
}
bic <- function(cvglmnetobj, Y, X){
  n = length(Y)
  like = rss(cvglmnetobj, Y, X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0)
  log(like) + 2*(df+1)*log(n)/n
}
These have been preloaded for you to use for the rest of the exercises.
Run the following code that finds the best model using AIC, BIC, and CV.
library(glmnet)
set.seed(20181023)
n = 100
p = 30
X = matrix(rnorm(n*p), n)
beta = c(rep(1,5), rep(0,p-5))
Y = c(X %*% beta + rnorm(n))
rss <- function(cvglmnetobj, Y, X){
  colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2 )
}
aic <- function(cvglmnetobj, Y, X){
  like = rss(cvglmnetobj, Y, X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0)
  log(like) + 2*(df+1)/length(Y)
}
bic <- function(cvglmnetobj, Y, X){
  n = length(Y)
  like = rss(cvglmnetobj, Y, X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0)
  log(like) + 2*(df+1)*log(n)/n
}
out = cv.glmnet(X, Y, intercept = FALSE)
bestmodels = list(
  AIC = out$glmnet.fit$beta[, which.min(aic(out, Y, X))],
  BIC = out$glmnet.fit$beta[, which.min(bic(out, Y, X))],
  CV  = c(coef(out, s = 'lambda.min')[-1,])
)
For each model selection criterion, how many of the correct coefficients do you select? How many incorrect? Fill in the missing code.
library(glmnet)
set.seed(20181023)
n = 100
p = 30
X = matrix(rnorm(n*p), n)
beta = c(rep(1,5), rep(0,p-5))
Y = c(X %*% beta + rnorm(n))
rss <- function(cvglmnetobj, Y, X){
  colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2 )
}
aic <- function(cvglmnetobj, Y, X){
  like = rss(cvglmnetobj, Y, X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0)
  log(like) + 2*(df+1)/length(Y)
}
bic <- function(cvglmnetobj, Y, X){
  n = length(Y)
  like = rss(cvglmnetobj, Y, X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0)
  log(like) + 2*(df+1)*log(n)/n
}
out = cv.glmnet(X, Y, intercept = FALSE)
bestmodels = list(
  AIC = out$glmnet.fit$beta[, which.min(aic(out, Y, X))],
  BIC = out$glmnet.fit$beta[, which.min(bic(out, Y, X))],
  CV  = c(coef(out, s = 'lambda.min')[-1,])
)
errs <- function(b, beta){
  TP = _________________ # True Positives
  FP = _________________ # False Positives
  c(TP, FP)
}
Your function should look similar to the following:
errs <- function(b, beta){
  TP = sum(abs(b) > 0 & abs(beta) > 0)  # True Positives
  FP = sum(abs(b) > 0 & beta == 0)      # False Positives
  c(TP, FP)
}
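For instance, on a small made-up example (the vectors below are purely illustrative, not part of the exercise data), an estimate that keeps variables 1 and 3 when only variables 1 and 2 are truly nonzero gives one true positive and one false positive:

b_hat     <- c(1.2, 0, 0.5)  # hypothetical estimate: variables 1 and 3 selected
beta_true <- c(1,   1, 0)    # truth: variables 1 and 2 are nonzero
errs(b_hat, beta_true)       # c(1, 1): one true positive, one false positive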
library(glmnet)
set.seed(20181023)
n = 100
p = 30
X = matrix(rnorm(n*p), n)
beta = c(rep(1,5), rep(0,p-5))
Y = c(X %*% beta + rnorm(n))
rss <- function(cvglmnetobj, Y, X){
  colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2 )
}
aic <- function(cvglmnetobj, Y, X){
  like = rss(cvglmnetobj, Y, X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0)
  log(like) + 2*(df+1)/length(Y)
}
bic <- function(cvglmnetobj, Y, X){
  n = length(Y)
  like = rss(cvglmnetobj, Y, X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0)
  log(like) + 2*(df+1)*log(n)/n
}
out = cv.glmnet(X, Y, intercept = FALSE)
bestmodels = list(
  AIC = out$glmnet.fit$beta[, which.min(aic(out, Y, X))],
  BIC = out$glmnet.fit$beta[, which.min(bic(out, Y, X))],
  CV  = c(coef(out, s = 'lambda.min')[-1,])
)
errs <- function(b, beta){
  TP = sum(abs(b) > 0 & abs(beta) > 0)  # True Positives
  FP = sum(abs(b) > 0 & beta == 0)      # False Positives
  c(TP, FP)
}
The following code will create a nice table showing the accuracy of each of the methods.
tab = sapply(bestmodels, errs, beta = beta)
rownames(tab) = c('true positives', 'false positives')
kable(tab)
quiz(caption="Questions", question("Which method had the lowest number of false positives?", answer("AIC"), answer("BIC", correct=TRUE), answer("CV"), answer("All the same"), allow_retry = TRUE, correct = paste0("Correct!") ), question("Which method had the highest number of true positives?", answer("AIC"), answer("BIC"), answer("CV"), answer("All the same.",correct = TRUE), allow_retry = TRUE, correct = paste0("Correct!") ) )
Repeat the same exercise but using $p=300$. The following functions have been saved and preloaded for your convenience:
aic <- function(cvglmnetobj, Y, X){
  like = colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2 )
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0)
  log(like) + 2*(df+1)/length(Y)
}
bic <- function(cvglmnetobj, Y, X){
  n = length(Y)
  like = colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2 )
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0)
  log(like) + 2*(df+1)*log(n)/n
}
errs <- function(b, beta){
  TP = sum(abs(b) > 0 & abs(beta) > 0)
  FP = sum(abs(b) > 0 & beta == 0)
  c(TP, FP)
}
rss <- function(cvglmnetobj, Y, X){
  colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2 )
}
aic <- function(cvglmnetobj, Y, X){
  like = rss(cvglmnetobj, Y, X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0)
  log(like) + 2*(df+1)/length(Y)
}
bic <- function(cvglmnetobj, Y, X){
  n = length(Y)
  like = rss(cvglmnetobj, Y, X)
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0)
  log(like) + 2*(df+1)*log(n)/n
}
errs <- function(b, beta){
  TP = sum(abs(b) > 0 & abs(beta) > 0)
  FP = sum(abs(b) > 0 & beta == 0)
  c(TP, FP)
}
library(glmnet)
set.seed(20181023)
n = 100
p = 300
X = matrix(rnorm(n*p), n)
beta = c(rep(1,5), rep(0,p-5))
Y = c(X %*% beta + rnorm(n))
Y
library(glmnet)
set.seed(20181023)
n = 100
p = _____
X = matrix(rnorm(n*p), n)
beta = c(rep(1,5), rep(0,p-5))
Y = c(X %*% beta + rnorm(n))
grade_result(
  pass_if(~ identical(.result, Y), "Correct!"),
  fail_if(~ TRUE, "Incorrect.")
)
aic <- function(cvglmnetobj, Y, X){
  like = colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2 )
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0)
  log(like) + 2*(df+1)/length(Y)
}
bic <- function(cvglmnetobj, Y, X){
  n = length(Y)
  like = colMeans( (Y - predict(cvglmnetobj$glmnet.fit, newx=X))^2 )
  df = colSums(abs(cvglmnetobj$glmnet.fit$beta) > 0)
  log(like) + 2*(df+1)*log(n)/n
}
errs <- function(b, beta){
  TP = sum(abs(b) > 0 & abs(beta) > 0)
  FP = sum(abs(b) > 0 & beta == 0)
  c(TP, FP)
}
library(glmnet)
set.seed(20181023)
n = 100
p = 300
X = matrix(rnorm(n*p), n)
beta = c(rep(1,5), rep(0,p-5))
Y = c(X %*% beta + rnorm(n))
Run the following code to produce a table of values that shows the accuracy of the AIC, BIC, and CV methods.
out = cv.glmnet(X, Y, intercept = FALSE)
bestmodels = list(
  AIC = out$glmnet.fit$beta[, which.min(aic(out, Y, X))],
  BIC = out$glmnet.fit$beta[, which.min(bic(out, Y, X))],
  CV  = c(coef(out, s = 'lambda.min')[-1,])
)
tab = sapply(bestmodels, errs, beta = beta)
rownames(tab) = c('true positives', 'false positives')
kable(tab)
quiz(caption="Questions", question("Which method had the lowest number of false positives?", answer("AIC"), answer("BIC", correct=TRUE), answer("CV"), answer("All the same"), allow_retry = TRUE, correct = paste0("Correct!") ), question("Which method had the highest number of true positives?", answer("AIC"), answer("BIC"), answer("CV"), answer("All the same.",correct = TRUE), allow_retry = TRUE, correct = paste0("Correct!") ) )
Did you notice any difference between the small and the large number of predictors?
There's not a huge difference in low dimensions: AIC and CV are essentially the same, picking slightly larger models than BIC. With $p=300$, AIC drastically over-selects (it chooses the largest model it can, based on the path), BIC does quite well, and CV slightly over-selects. In all cases, the true coefficients are always included (the signal-to-noise ratio is fairly large).
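As a quick check (a sketch that assumes `out` and `bestmodels` from the $p=300$ fit are still in your workspace), you can count how many variables each criterion keeps; the true model contains 5:

sapply(bestmodels, function(b) sum(abs(b) > 0))  # number of selected variables per criterion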