```r
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(bis557)
```

1. CASL 2.11 Exercises, Problem 5

We solve the normal equations $\mathbf{X}^{\intercal}\mathbf{X} \boldsymbol{\beta} = \mathbf{X}^{\intercal}\mathbf{Y}$ and obtain the explicit formula $$ \boldsymbol{\widehat\beta} = \begin{pmatrix} \widehat \beta_0 \\ \widehat \beta_1 \end{pmatrix} = (\mathbf{X}^{\intercal}\mathbf{X})^{-1}\mathbf{X}^{\intercal}\mathbf{Y}. $$

$$ \mathbf{X}^{\intercal}\mathbf{X} = \begin{pmatrix} 1 & 1 & \dots & 1 \\ x_1 & x_2 & \dots & x_n \end{pmatrix} \begin{pmatrix} 1 & x_1 \\ 1 & x_2 \\ \vdots & \vdots \\ 1 & x_n \end{pmatrix} = \begin{pmatrix} n & n \bar{x} \\ n \bar{x} & \sum_{i=1}^n x_i^2 \end{pmatrix} $$ $$ (\mathbf{X}^{\intercal}\mathbf{X})^{-1} = \frac{1}{n\sum_{i=1}^n x_i^2 - n^2\bar{x}^2} \begin{pmatrix} \sum_{i=1}^n x_i^2 & -n \bar{x} \\ -n \bar{x} & n \end{pmatrix} = \frac{1}{n\sum_{i=1}^n (x_i-\bar{x})^2} \begin{pmatrix} \sum_{i=1}^n x_i^2 & -n \bar{x} \\ -n \bar{x} & n \end{pmatrix} $$ $$ \mathbf{X}^{\intercal}\mathbf{Y} = \begin{pmatrix} 1 & 1 & \dots & 1 \\ x_1 & x_2 & \dots & x_n \end{pmatrix}\begin{pmatrix} y_1 \\ y_2 \\ \vdots \\ y_n \end{pmatrix} = \begin{pmatrix} n \bar{y} \\ \sum_{i=1}^n x_iy_i \end{pmatrix} $$

$$ \begin{aligned} \boldsymbol{\widehat\beta} = \begin{pmatrix} \widehat \beta_0 \\ \widehat \beta_1 \end{pmatrix} &= \frac{1}{n\sum_{i=1}^n (x_i-\bar{x})^2} \begin{pmatrix} \sum_{i=1}^n x_i^2 & -n \bar{x} \\ -n \bar{x} & n \end{pmatrix}\begin{pmatrix} n \bar{y} \\ \sum_{i=1}^n x_iy_i \end{pmatrix} \\ &= \frac{1}{n\sum_{i=1}^n (x_i-\bar{x})^2}\begin{pmatrix} n \bar{y}\sum_{i=1}^n x_i^2 - n \bar{x}\sum_{i=1}^n x_iy_i \\ -n^2 \bar{x}\bar{y} + n\sum_{i=1}^n x_iy_i \end{pmatrix} \\ &= \frac{1}{\sum_{i=1}^n (x_i-\bar{x})^2}\begin{pmatrix} \bar{y}\sum_{i=1}^n x_i^2 - n\bar{x}^2\bar{y} + n\bar{x}^2\bar{y} - \bar{x}\sum_{i=1}^n x_iy_i \\ \sum_{i=1}^n x_iy_i - n\bar{x}\bar{y} \end{pmatrix} \\ &= \begin{pmatrix} \frac{\bar{y}\sum_{i=1}^n (x_i-\bar{x})^2 - \bar{x}\left(\sum_{i=1}^n x_iy_i - n\bar{x}\bar{y}\right)}{\sum_{i=1}^n (x_i-\bar{x})^2} \\ \frac{\sum_{i=1}^n x_iy_i - n\bar{x}\bar{y}}{\sum_{i=1}^n (x_i-\bar{x})^2} \end{pmatrix} \end{aligned} $$ In this case, we have $$ \widehat \beta_1 = \frac{\sum_{i=1}^n x_iy_i - n\bar{x}\bar{y}}{\sum_{i=1}^n (x_i-\bar{x})^2} = \frac{\sum_{i=1}^n (x_i-\bar{x})(y_i-\bar{y})}{\sum_{i=1}^n (x_i-\bar{x})^2} $$ and $$ \widehat \beta_0 = \bar y - \widehat \beta_1 \bar x. $$
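
As a quick numerical check of these formulas (an added illustration using only base R), we can compute $\widehat\beta_0$ and $\widehat\beta_1$ directly and compare them with `lm()` on the iris data:

```r
# Closed-form simple-regression estimates compared against lm()
x <- iris$Petal.Width
y <- iris$Sepal.Length
n <- length(x)

beta1_hat <- (sum(x * y) - n * mean(x) * mean(y)) / sum((x - mean(x))^2)
beta0_hat <- mean(y) - beta1_hat * mean(x)

c(beta0_hat, beta1_hat)
coef(lm(Sepal.Length ~ Petal.Width, data = iris))
```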

2. A comparison of the two gradient descent functions.

```r
data(iris)
# Fit the same model with both gradient descent implementations
model_gradient_descent <- gradient_descent(Sepal.Length ~ Petal.Width, iris)
model_gradient_descent_new <- gradient_descent_new(Sepal.Length ~ Petal.Width, iris)
print(model_gradient_descent$coefficients)
print(model_gradient_descent_new$coefficients)
```
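
For reference (an added check, not part of the original comparison), the same model can be fit by ordinary least squares with `lm()`; both gradient descent fits should be close to these coefficients if they have converged:

```r
# OLS baseline for the same model
coef(lm(Sepal.Length ~ Petal.Width, data = iris))
```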

3. A ridge regression function taking into account collinear (or nearly collinear) regression variables.

```r
# Simulated data in which the first column is nearly collinear with the second
n <- 1000
p <- 5
beta <- c(1, -1, 0, 0, 2)
set.seed(2)
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
alpha <- 0.05
X[, 1] <- X[, 1] * alpha + X[, 2] * (1 - alpha)
set.seed(1)
Y <- X %*% beta + rnorm(n)
model_ridge_regression <- ridge_regression(X, Y, 1.2)
print(model_ridge_regression$coefficients)
```
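
As an added sanity check, the textbook closed-form ridge solution $(\mathbf{X}^{\intercal}\mathbf{X} + \lambda \mathbf{I})^{-1}\mathbf{X}^{\intercal}\mathbf{Y}$ can be computed directly for the same data; whether `ridge_regression()` centers, scales, or adds an intercept may make its output differ slightly from this:

```r
# Closed-form ridge estimate for the same X, Y, and lambda = 1.2
lambda <- 1.2
beta_ridge_direct <- solve(crossprod(X) + lambda * diag(p), crossprod(X, Y))
print(beta_ridge_direct)
```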

4. A function optimizing the ridge penalty parameter.

```r
# Simulated data, again with a nearly collinear first column
n <- 500
p <- 5
beta <- c(1, -1, 0, 0, 2)
set.seed(9999)
X <- matrix(rnorm(n * p), ncol = p)
alpha <- 0.05
X[, 1] <- X[, 1] * alpha + X[, 2] * (1 - alpha)
Y <- X %*% beta + rnorm(n)
# Candidate penalty values; optimize_ridge picks the best one using k = 5 folds
lambdas <- seq(0.1, 10, by = 0.1)
lambda_ridge <- optimize_ridge(X, Y, lambdas, k = 5)
print(lambda_ridge)
```
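
For readers without the package, the sketch below shows what a k-fold cross-validation search over `lambdas` could look like, using the closed-form ridge estimate inside each fold. This is only an illustrative assumption about the approach; the actual `optimize_ridge()` may differ in its loss, fold construction, and return value.

```r
# Minimal 5-fold CV over a lambda grid using the closed-form ridge estimate.
# Illustrative sketch, not the package implementation.
cv_ridge_lambda <- function(X, Y, lambdas, k = 5) {
  # Randomly assign each row to one of k folds
  folds <- sample(rep(seq_len(k), length.out = nrow(X)))
  cv_mse <- sapply(lambdas, function(lambda) {
    mean(sapply(seq_len(k), function(fold) {
      train <- folds != fold
      # Ridge fit on the training rows
      b <- solve(crossprod(X[train, ]) + lambda * diag(ncol(X)),
                 crossprod(X[train, ], Y[train]))
      # Mean squared prediction error on the held-out rows
      mean((Y[!train] - X[!train, ] %*% b)^2)
    }))
  })
  lambdas[which.min(cv_mse)]
}

cv_ridge_lambda(X, Y, lambdas, k = 5)
```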

5. Consider the LASSO penalty

$$ \frac{1}{2n} ||Y - X \beta||_2^2 + \lambda ||\beta||_1. $$ Show that if $|X_j^{\intercal}Y| \leq n \lambda$, then $\widehat \beta_j^{\text{LASSO}}$ must be zero.

First, we assume that the columns of $\mathbf{X}$ are orthonormal, i.e., $\mathbf{X}^{\intercal}\mathbf{X} = \mathbf{I}$. Then: $$ \begin{aligned} L &= \frac{1}{2n} ||Y - X \beta||_2^2 + \lambda ||\beta||_1 \\ &= \frac{1}{2n}\left(Y^{\intercal}Y + \beta^{\intercal}X^{\intercal}X\beta - 2\beta^{\intercal}X^{\intercal}Y\right) + \lambda \sum_{j=1}^{p}|\beta_j| \\ &= \frac{1}{2n}Y^{\intercal}Y + \frac{1}{2n}\sum_{j=1}^{p}\left(\beta_j^2 - 2\beta_jX_j^{\intercal}Y + 2n\lambda |\beta_j|\right) \end{aligned} $$ The objective therefore separates across coordinates, so each $\beta_j$ can be minimized on its own.

Let $f(\beta_j)=\beta_j^2-2\beta_jX_j^{\intercal}Y+2n\lambda |\beta_j|$

When $\beta_j = 0$, $f(\beta_j) = 0$, so the value $0$ is always attainable; hence $\widehat\beta_j^{\text{lasso}} = 0$ unless some nonzero $\beta_j$ achieves $f(\beta_j) < 0$.

When $\beta_j > 0$, $$ \frac{d f(\beta_j)}{d \beta_j} = 2\beta_j - 2X_j^{\intercal}Y + 2n\lambda. $$ If $|X_j^{\intercal}Y| \leq n \lambda$, then $-2X_j^{\intercal}Y + 2n\lambda \geq 0$, so $\frac{d f(\beta_j)}{d \beta_j} \geq 2\beta_j > 0$ and $f$ is strictly increasing on $(0, \infty)$. Hence $f(\beta_j) > f(0) = 0$ for every $\beta_j > 0$, and no positive $\beta_j$ can be the minimizer.

When $\beta_j < 0$, $$ \frac{d f(\beta_j)}{d \beta_j} = 2\beta_j - 2X_j^{\intercal}Y - 2n\lambda. $$ If $|X_j^{\intercal}Y| \leq n \lambda$, then $-2X_j^{\intercal}Y - 2n\lambda \leq 0$, so $\frac{d f(\beta_j)}{d \beta_j} \leq 2\beta_j < 0$ and $f$ is strictly decreasing on $(-\infty, 0)$. Hence $f(\beta_j) > f(0) = 0$ for every $\beta_j < 0$, and no negative $\beta_j$ can be the minimizer.

In conclusion, when $|X_j^{\intercal}Y| \leq n \lambda$ the minimum of $f$ is attained at $\beta_j = 0$, so $\widehat \beta_j^{\text{LASSO}}$ must be zero.
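
As a small added illustration under the same orthonormal-design assumption: the case analysis above amounts to soft-thresholding, $\widehat\beta_j = \operatorname{sign}(X_j^{\intercal}Y)\,\max(|X_j^{\intercal}Y| - n\lambda,\, 0)$, so any coordinate with $|X_j^{\intercal}Y| \leq n\lambda$ is set exactly to zero. A minimal sketch in base R:

```r
# With orthonormal columns, the LASSO solution is the soft-threshold of X_j' Y,
# so |X_j' Y| <= n * lambda forces beta_j to be exactly zero.
set.seed(42)
n <- 100
p <- 4
X <- qr.Q(qr(matrix(rnorm(n * p), n, p)))   # orthonormal columns
Y <- X %*% c(3, 0.5, 0, -2) + rnorm(n, sd = 0.1)

lambda <- 0.01
xty <- drop(crossprod(X, Y))
beta_lasso <- sign(xty) * pmax(abs(xty) - n * lambda, 0)

rbind(xty = xty, threshold = n * lambda, beta_lasso = beta_lasso)
```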


