```r
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
library(bis557homework2)
```
Let $X=\left[\begin{array}{cc} 1 & x_1\\ \vdots & \vdots\\ 1 & x_n \end{array}\right]$; then $X^T=\left[\begin{array}{ccc} 1 & \cdots & 1\\ x_1 & \cdots & x_n \end{array}\right]$.
Let $Y$ be the response vector $\left[\begin{array}{c} y_1\\ \vdots\\ y_n \end{array}\right]$.
We know that the least-squares estimate can be written as $\beta=(X^TX)^{-1}X^TY$.
Using the matrices above, we can expand $\beta$ as
$$\begin{aligned}
\beta&=(X^TX)^{-1}X^TY\\
&=\left[\begin{array}{cc} n & \sum x_i\\ \sum x_i & \sum x_i^2 \end{array}\right]^{-1}X^TY\\
&=\frac{1}{n\sum x_i^2-\left(\sum x_i\right)^2}\left[\begin{array}{cc} \sum x_i^2 & -\sum x_i\\ -\sum x_i & n \end{array}\right]X^TY\\
&=\frac{1}{n\sum x_i^2-\left(\sum x_i\right)^2}\left[\begin{array}{c} \sum x_i^2\sum y_i-\sum x_i\sum x_iy_i\\ n\sum x_iy_i-\sum x_i\sum y_i \end{array}\right]
\end{aligned}$$
Therefore, $\beta_0=\dfrac{\sum x_i^2\sum y_i-\sum x_i\sum x_iy_i}{n\sum x_i^2-\left(\sum x_i\right)^2}$ and $\beta_1=\dfrac{n\sum x_iy_i-\sum x_i\sum y_i}{n\sum x_i^2-\left(\sum x_i\right)^2}$.
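As a quick sanity check (my addition, not part of the derivation), we can compare these closed-form expressions against R's built-in `lm()` on simulated data; the intercept and slope used below are arbitrary choices.

```r
# Verify the closed-form beta_0 and beta_1 against lm() on toy data
set.seed(42)
x <- rnorm(50)
y <- 2 + 3 * x + rnorm(50)       # arbitrary true intercept 2 and slope 3
n <- length(x)
denom <- n * sum(x^2) - sum(x)^2
beta0 <- (sum(x^2) * sum(y) - sum(x) * sum(x * y)) / denom
beta1 <- (n * sum(x * y) - sum(x) * sum(y)) / denom
c(beta0, beta1)
coef(lm(y ~ x))                  # should match the two values above
```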
```r
# Build up the OLS estimator via the SVD
casl_ols_svd <- function(X, y) {
  svd_output <- svd(X)
  # Keep only singular values above machine precision (numerical rank)
  r <- sum(svd_output$d > .Machine$double.eps)
  U <- svd_output$u[, 1:r]
  V <- svd_output$v[, 1:r]
  beta <- V %*% (t(U) %*% y / svd_output$d[1:r])
  beta
}

# Gradient descent for least squares
cost <- function(X, y, beta) {
  sum((X %*% beta - y)^2) / (2 * length(y))
}

gradient <- function(X, y, alpha, iter_nums) {
  cost_history <- double(iter_nums)  # kept for convergence diagnostics
  theta <- as.matrix(rep(0, ncol(X)))
  for (i in 1:iter_nums) {
    error <- X %*% theta - y
    delta <- t(X) %*% error / length(y)
    theta <- theta - alpha * delta
    cost_history[i] <- cost(X, y, theta)
  }
  theta
}

# Build up the data set
set.seed(1)
n <- 1e4; p <- 4
X <- matrix(rnorm(n * p), ncol = p)
beta <- c(1, 2, 3, 4)
epsilon <- rnorm(n)
y <- X %*% beta + epsilon

# OLS on the entire data set
beta_h_svd <- casl_ols_svd(X, y)
error_ols <- sqrt(sum((y - X %*% beta_h_svd)^2))
error_ols

# 10-fold cross-validation for gradient descent
error_gd_cv <- rep(0, 10)
folds <- cut(seq(1, nrow(X)), breaks = 10, labels = FALSE)
for (i in 1:10) {
  testIndexes <- which(folds == i, arr.ind = TRUE)
  X_train <- X[-testIndexes, ]   # train on the other nine folds
  y_train <- y[-testIndexes, ]
  X_test <- X[testIndexes, ]     # hold out the i-th fold for testing
  y_test <- y[testIndexes, ]
  beta_gd <- gradient(X_train, y_train, 0.5, 10000)
  error_gd_cv[i] <- sqrt(sum((y_test - X_test %*% beta_gd)^2))
}
error_gd <- mean(error_gd_cv)
error_gd
```
From the code above, the error from the OLS fit on the full data set is about 101.8, close to the $\sqrt{n}\,\sigma=100$ that the noise alone would contribute. The cross-validated gradient-descent error is computed on one tenth of the data per fold, so it sits on a smaller scale; once both are converted to a per-observation scale, the two methods reduce the residuals essentially to the noise level, i.e. gradient descent matches OLS closely.
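Since the two errors are computed over different numbers of observations, a fairer comparison (a small addition of mine, not part of the original assignment) is the per-observation RMSE, using the objects defined above:

```r
# Put both errors on the same per-observation scale
rmse_ols <- error_ols / sqrt(nrow(X))        # full data set: n rows
rmse_gd  <- error_gd / sqrt(nrow(X) / 10)    # each test fold: n/10 rows
c(rmse_ols, rmse_gd)                         # both should be close to sd = 1
```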
```r
# Construct a nearly collinear design: column 1 is almost a copy of column 2
n1 <- 1000
p1 <- 25
beta1 <- c(1, rep(0, p1 - 1))
X1 <- matrix(rnorm(n1 * p1), ncol = p1)
alpha1 <- 0.001
X1[, 1] <- X1[, 1] * alpha1 + X1[, 2] * (1 - alpha1)

# Ridge regression: solve (X'X + lambda * I) beta = X'y
My_ridge_regression <- function(X, y, lambda) {
  p <- ncol(X)
  solve(crossprod(X) + lambda * diag(p), crossprod(X, y))
}
```
```r
# Simulate repeatedly and record the l2 estimation error of ridge
N1 <- 1e4
l2_errors <- rep(0, N1)
for (k in 1:N1) {
  y1 <- X1 %*% beta1 + rnorm(n1)
  betahat1 <- My_ridge_regression(X1, y1, 0.1)
  l2_errors[k] <- sqrt(sum((betahat1 - beta1)^2))
}
mean(l2_errors)
```
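To see what the ridge penalty buys on this nearly collinear design, one can rerun the same simulation with $\lambda=0$, which reduces to plain OLS. This baseline comparison is my addition and uses only objects defined above; the OLS error should be noticeably larger because the near-collinearity inflates the variance of the unpenalized estimate.

```r
# Baseline: lambda = 0 reduces to OLS on the same design
l2_errors_ols <- rep(0, N1)
for (k in 1:N1) {
  y1 <- X1 %*% beta1 + rnorm(n1)
  betahat_ols <- My_ridge_regression(X1, y1, 0)
  l2_errors_ols[k] <- sqrt(sum((betahat_ols - beta1)^2))
}
mean(l2_errors_ols)  # expected to exceed the ridge error above
```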
```r
# Simulate training and test data
set.seed(4)
n <- 200
p <- 4
N <- 500
beta <- c(1, -1, 0.5, 0)
X <- matrix(rnorm(n * p), ncol = p)
y <- X %*% beta + rnorm(n, sd = 5)
X_test <- matrix(rnorm(n * p), ncol = p)
y_test <- as.numeric(X_test %*% beta + rnorm(n, sd = 5))

# Grid of candidate penalties
lambda_vals <- seq(0, n * 2, length.out = N)

# Ridge solutions for every lambda at once, via the SVD of X
casl_lm_ridge <- function(X, y, lambda_vals) {
  svd_obj <- svd(X)
  U <- svd_obj$u
  V <- svd_obj$v
  svals <- svd_obj$d
  k <- length(lambda_vals)
  ridge_beta <- matrix(NA_real_, nrow = k, ncol = ncol(X))
  for (j in seq_len(k)) {
    # Shrink each singular direction by d / (d^2 + lambda)
    D <- diag(svals / (svals^2 + lambda_vals[j]))
    ridge_beta[j, ] <- V %*% D %*% t(U) %*% y
  }
  ridge_beta
}

beta_mat <- casl_lm_ridge(X, y, lambda_vals)
y_hat <- tcrossprod(X_test, beta_mat)      # one column of predictions per lambda
mse <- apply((y_hat - y_test)^2, 2, mean)  # test MSE for each lambda
lambda_vals[which.min(mse)]                # the lambda minimizing test MSE
```
In this question, I use a grid search to find the $\lambda$ that minimizes the out-of-sample prediction error (the test MSE).
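It can also help to look at the whole MSE curve rather than just its minimizer; the short plotting snippet below is an optional addition of mine, using the `lambda_vals` and `mse` objects already computed.

```r
# Test MSE as a function of lambda, with the minimizer marked
plot(lambda_vals, mse, type = "l", xlab = "lambda", ylab = "test MSE")
abline(v = lambda_vals[which.min(mse)], lty = 2)
```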
For $\arg\min_\beta\frac{1}{2n}\|Y-X\beta\|^2_2+\lambda\|\beta\|_1$, the KKT conditions say that $\hat{\beta}$ is a solution if and only if $X^T(Y-X\hat{\beta})=n\lambda\gamma$, where
$$\gamma_i\in\begin{cases} \operatorname{sign}(\hat{\beta}_i) & \text{if }\hat{\beta}_i\neq0,\\ [-1,1] & \text{if }\hat{\beta}_i=0. \end{cases}$$
Assume $\hat{\beta}_i\neq0$. Then $\gamma_i=\operatorname{sign}(\hat{\beta}_i)=\pm1$, so the KKT condition would force $|X_i^T(Y-X\hat{\beta})|=n\lambda$ exactly. On the other hand, the condition given in the question, $|X_i^TY|\leq n\lambda$, already makes $\hat{\beta}=0$ a valid solution: at $\hat{\beta}=0$ we have $X_i^T(Y-X\hat{\beta})=X_i^TY$, so we may take $\gamma_i=X_i^TY/(n\lambda)\in[-1,1]$ and the KKT conditions are satisfied.
Therefore the assumption is incorrect and $\hat{\beta}_i=0$.
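As a numerical illustration (my addition, assuming the `glmnet` package is available), `glmnet` fits the same objective $\frac{1}{2n}\|Y-X\beta\|_2^2+\lambda\|\beta\|_1$ when standardization and the intercept are turned off, so setting $\lambda=\max_j|X_j^TY|/n$ should zero out every coefficient, as the KKT argument predicts.

```r
library(glmnet)
set.seed(7)
Xk <- matrix(rnorm(100 * 5), ncol = 5)
yk <- rnorm(100)
# Smallest lambda satisfying |X_j^T Y| <= n * lambda for every j
lambda_max <- max(abs(crossprod(Xk, yk))) / nrow(Xk)
fit <- glmnet(Xk, yk, lambda = lambda_max, intercept = FALSE, standardize = FALSE)
as.numeric(coef(fit))[-1]  # all coefficients are zero
```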