```r
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
library(bis557)
```
We solve the normal equations $\mathbf{X}^{\intercal}\mathbf{X} \boldsymbol{\beta} = \mathbf{X}^{\intercal}\mathbf{Y}$, which yield the explicit formula
$$ \begin{pmatrix} \widehat \beta_0 \\ \widehat \beta_1 \end{pmatrix} = \boldsymbol{\widehat\beta} = (\mathbf{X}^{\intercal}\mathbf{X})^{-1}\mathbf{X}^{\intercal}\mathbf{Y}. $$
$$
\mathbf{X}^{\intercal}\mathbf{X} =
\begin{pmatrix} 1 & 1 & \dots & 1 \\ x_1 & x_2 & \dots & x_n \end{pmatrix}
\begin{pmatrix} 1 & x_1 \\ 1 & x_2 \\ \vdots & \vdots \\ 1 & x_n \end{pmatrix}
= \begin{pmatrix} n & n \bar{x} \\ n \bar{x} & \sum_{i=1}^n x_i^2 \end{pmatrix}
$$
$$
(\mathbf{X}^{\intercal}\mathbf{X})^{-1}
= \frac{1}{n\sum_{i=1}^n x_i^2 - n^2\bar{x}^2}
\begin{pmatrix} \sum_{i=1}^n x_i^2 & -n \bar{x} \\ -n \bar{x} & n \end{pmatrix}
= \frac{1}{n\sum_{i=1}^n (x_i-\bar{x})^2}
\begin{pmatrix} \sum_{i=1}^n x_i^2 & -n \bar{x} \\ -n \bar{x} & n \end{pmatrix}
$$
$$
\mathbf{X}^{\intercal}\mathbf{Y} =
\begin{pmatrix} 1 & 1 & \dots & 1 \\ x_1 & x_2 & \dots & x_n \end{pmatrix}
\begin{pmatrix} y_1 \\ y_2 \\ \vdots \\ y_n \end{pmatrix}
= \begin{pmatrix} n \bar{y} \\ \sum_{i=1}^n x_iy_i \end{pmatrix}
$$
$$
\begin{aligned}
\boldsymbol{\widehat\beta} = \begin{pmatrix} \widehat \beta_0 \\ \widehat \beta_1 \end{pmatrix}
&= \frac{1}{n\sum_{i=1}^n (x_i-\bar{x})^2}
\begin{pmatrix} \sum_{i=1}^n x_i^2 & -n \bar{x} \\ -n \bar{x} & n \end{pmatrix}
\begin{pmatrix} n \bar{y} \\ \sum_{i=1}^n x_iy_i \end{pmatrix} \\
&= \frac{1}{n\sum_{i=1}^n (x_i-\bar{x})^2}
\begin{pmatrix} n \bar{y}\sum_{i=1}^n x_i^2 - n \bar{x}\sum_{i=1}^n x_iy_i \\ -n^2 \bar{x}\bar{y} + n\sum_{i=1}^n x_iy_i \end{pmatrix} \\
&= \frac{1}{\sum_{i=1}^n (x_i-\bar{x})^2}
\begin{pmatrix} \bar{y}\sum_{i=1}^n x_i^2 - n\bar{x}^2\bar{y} + n\bar{x}^2\bar{y} - \bar{x}\sum_{i=1}^n x_iy_i \\ \sum_{i=1}^n x_iy_i - n\bar{x}\bar{y} \end{pmatrix} \\
&= \begin{pmatrix}
\dfrac{\bar{y}\sum_{i=1}^n (x_i-\bar{x})^2 - \bar{x}\left(\sum_{i=1}^n x_iy_i - n\bar{x}\bar{y}\right)}{\sum_{i=1}^n (x_i-\bar{x})^2} \\[1.5ex]
\dfrac{\sum_{i=1}^n x_iy_i - n\bar{x}\bar{y}}{\sum_{i=1}^n (x_i-\bar{x})^2}
\end{pmatrix}
\end{aligned}
$$
In this case, we have
$$ \widehat \beta_1 = \frac{\sum_{i=1}^n x_iy_i - n\bar{x}\bar{y}}{\sum_{i=1}^n (x_i-\bar{x})^2} $$
and
$$ \widehat \beta_0 = \bar y - \widehat \beta_1 \bar x. $$
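As a quick numerical sanity check (not part of the derivation itself), the closed-form expressions can be compared against R's built-in `lm()` on the `iris` data used later in this vignette:

```r
# Verify the closed-form simple-regression estimates against lm()
x <- iris$Petal.Width
y <- iris$Sepal.Length

beta1_hat <- (sum(x * y) - length(x) * mean(x) * mean(y)) /
  sum((x - mean(x))^2)
beta0_hat <- mean(y) - beta1_hat * mean(x)

c(beta0_hat, beta1_hat)
coef(lm(Sepal.Length ~ Petal.Width, data = iris))
```

The two results should agree to machine precision, since `lm()` solves the same normal equations.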
```r
data(iris)
model_gradient_descent <- gradient_descent(Sepal.Length ~ Petal.Width, iris)
model_gradient_descent_new <- gradient_descent_new(Sepal.Length ~ Petal.Width, iris)
print(model_gradient_descent$coefficients)
print(model_gradient_descent_new$coefficients)
```
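If both gradient descent implementations have converged, their coefficients should agree, up to numerical tolerance, with the exact least-squares solution (this assumes the package functions return coefficients in the same intercept-first order as `lm()`):

```r
# Reference OLS fit; small discrepancies reflect finite step sizes
# and iteration counts in the gradient descent routines.
coef(lm(Sepal.Length ~ Petal.Width, data = iris))
```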
```r
n <- 1000
p <- 5
beta <- c(1, -1, 0, 0, 2)
set.seed(2)
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
alpha <- 0.05
X[, 1] <- X[, 1] * alpha + X[, 2] * (1 - alpha)
set.seed(1)
Y <- X %*% beta + rnorm(n)
model_ridge_regression <- ridge_regression(X, Y, 1.2)
print(model_ridge_regression$coefficients)
```
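For reference, the textbook ridge estimator $\widehat\beta^{\text{ridge}} = (\mathbf{X}^{\intercal}\mathbf{X} + \lambda \mathbf{I})^{-1}\mathbf{X}^{\intercal}\mathbf{Y}$ can be computed directly. This is a minimal sketch for comparison only; it may differ from `ridge_regression()` if that function centers, scales, or fits an intercept:

```r
# Closed-form ridge estimate (no intercept, no scaling), using the
# X, Y, and p defined in the chunk above
lambda <- 1.2
beta_ridge <- solve(crossprod(X) + lambda * diag(p), crossprod(X, Y))
beta_ridge
```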
```r
n <- 500
p <- 5
beta <- c(1, -1, 0, 0, 2)
set.seed(9999)
X <- matrix(rnorm(n * p), ncol = p)
alpha <- 0.05
X[, 1] <- X[, 1] * alpha + X[, 2] * (1 - alpha)
Y <- X %*% beta + rnorm(n)
lambdas <- seq(0.1, 10, by = 0.1)
lambda_ridge <- optimize_ridge(X, Y, lambdas, k = 5)
print(lambda_ridge)
```
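A $\lambda$ chosen by $k$-fold cross-validation can also be reproduced from scratch using the closed-form estimator above. The sketch below shows one way such a search could be written; `cv_ridge` is an illustrative helper, not the package's `optimize_ridge()` implementation, so its selected $\lambda$ need not match exactly:

```r
# Illustrative k-fold CV for ridge regression (not the package code):
# for each lambda, average the held-out MSE over k folds and return
# the lambda with the smallest average error.
cv_ridge <- function(X, Y, lambdas, k = 5) {
  folds <- sample(rep(seq_len(k), length.out = nrow(X)))
  mse <- sapply(lambdas, function(lambda) {
    errs <- sapply(seq_len(k), function(fold) {
      train <- folds != fold
      b <- solve(crossprod(X[train, ]) + lambda * diag(ncol(X)),
                 crossprod(X[train, ], Y[train]))
      mean((Y[!train] - X[!train, ] %*% b)^2)
    })
    mean(errs)
  })
  lambdas[which.min(mse)]
}

set.seed(9999)
cv_ridge(X, Y, lambdas, k = 5)
```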
Consider the LASSO objective
$$ \frac{1}{2n} \|Y - X \beta\|_2^2 + \lambda \|\beta\|_1. $$
Show that if $|X_j^{\intercal}Y| \leq n \lambda$, then the corresponding coefficient $\widehat \beta_j^{\text{LASSO}}$ must be zero.
First, we assume that the columns of $\mathbf{X}$ are orthonormal, so that $\mathbf{X}^{\intercal}\mathbf{X} = \mathbf{I}$. Then:
$$
\begin{aligned}
L &= \frac{1}{2n} \|Y - X \beta\|_2^2 + \lambda \|\beta\|_1 \\
&= \frac{1}{2n}\left(Y^{\intercal}Y + \beta^{\intercal}X^{\intercal}X\beta - 2\beta^{\intercal}X^{\intercal}Y\right) + \lambda \sum_{j=1}^{p}|\beta_j| \\
&= \frac{1}{2n}Y^{\intercal}Y + \frac{1}{2n}\sum_{j=1}^{p}\left(\beta_j^2 - 2\beta_j X_j^{\intercal}Y + 2n\lambda |\beta_j|\right)
\end{aligned}
$$
The first term does not depend on $\beta$, so the objective separates across coordinates.
Let $f(\beta_j)=\beta_j^2-2\beta_j X_j^{\intercal}Y+2n\lambda |\beta_j|$; minimizing $L$ reduces to minimizing each $f(\beta_j)$ separately. Suppose now that $|X_j^{\intercal}Y| \leq n \lambda$.
When $\beta_j = 0$, $f(\beta_j) = 0$.

When $\beta_j > 0$, $|\beta_j| = \beta_j$ and
$$ \frac{d f(\beta_j)}{d \beta_j} = 2\beta_j - 2X_j^{\intercal}Y + 2n\lambda \geq 2\beta_j - 2|X_j^{\intercal}Y| + 2n\lambda \geq 2\beta_j > 0, $$
so $f$ is strictly increasing on $(0, \infty)$ and $f(\beta_j) > f(0) = 0$ for every $\beta_j > 0$.

When $\beta_j < 0$, $|\beta_j| = -\beta_j$ and
$$ \frac{d f(\beta_j)}{d \beta_j} = 2\beta_j - 2X_j^{\intercal}Y - 2n\lambda \leq 2\beta_j + 2|X_j^{\intercal}Y| - 2n\lambda \leq 2\beta_j < 0, $$
so $f$ is strictly decreasing on $(-\infty, 0)$ and again $f(\beta_j) > f(0) = 0$ for every $\beta_j < 0$.
In conclusion, $f$ attains its minimum only at $\beta_j = 0$: if $|X_j^{\intercal}Y| \leq n \lambda$, then $\widehat \beta_j^{\text{LASSO}} = 0$.
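More generally, minimizing $f$ without the assumption on $|X_j^{\intercal}Y|$ gives the soft-thresholding rule $\widehat\beta_j = \operatorname{sign}(X_j^{\intercal}Y)\max(|X_j^{\intercal}Y| - n\lambda,\, 0)$, which makes the zero threshold visible numerically. A minimal sketch under the orthonormal-design assumption above (`soft_threshold` is an illustrative function, not part of the package):

```r
# Coordinate-wise LASSO solution under X^T X = I:
# beta_j is exactly zero whenever |z| = |X_j^T Y| <= n * lambda.
soft_threshold <- function(z, n, lambda) {
  sign(z) * pmax(abs(z) - n * lambda, 0)
}

n <- 100
lambda <- 0.1  # threshold is n * lambda = 10
soft_threshold(z = c(-25, -5, 0, 5, 25), n = n, lambda = lambda)
# values with |z| <= 10 map to 0; larger |z| shrink toward zero by 10
```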