```{r}
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
knitr::opts_chunk$set(fig.align = 'center')
options(markdown.extensions = c('latex_macros'))

# set inline numbers to two digits
n_digits = 2
inline_hook <- function(x) {
  if (is.numeric(x)) x <- round(x, n_digits)
  paste(as.character(x), collapse = ", ")
}
knitr::knit_hooks$set(inline = inline_hook)

pacman::p_load(knitr)
pacman::p_load(MASS)
pacman::p_load(GGally)
pacman::p_load(devtools)
pacman::p_load(tidyverse)
pacman::p_load(kknn)
pacman::p_load(caret)
pacman::p_load(ElemStatLearn)
pacman::p_load(foreach)
pacman::p_load(doMC)

# use all the cores. yolo
registerDoMC(cores = detectCores())
```
\newcommand{\Cov}[2]{\mathrm{Cov}\left(#1,\ #2\right)} \newcommand{\Trace}[1]{\mathrm{Trace}\left( #1 \right)} \newcommand{\mat}[1]{\mathbf{ #1 }}
a) Calculate the sample variance-covariance matrix $\hat{\Sigma}$ of $X$ (using the maximum likelihood estimator, not the unbiased version). Then calculate $\hat{\Sigma}^{-\frac{1}{2}}$.
The maximum likelihood estimator of the variance-covariance matrix is given by the following formula:
$$\hat{\Sigma} = \frac{1}{n} \sum_{i = 1}^{n}(X_i - \bar{X})(X_{i} - \bar{X})^{T}$$ where $X_i \in \mathbb{R}^{p \times 1}$ is the $i$-th observation as a column vector and $\bar{X}$ is the sample mean. This is calculated by the following function:
```{r}
#' Sample Variance-Covariance Matrix
#'
#' Calculates the sample variance-covariance matrix using the
#' biased MLE estimate.
#'
#' @param X A matrix of shape [n_samples, n_features].
#' @return The covariance matrix of \code{X}. Has shape
#'   [n_features, n_features].
sample_covariance <- function(X) {
  n_samples <- nrow(X)

  # center the data matrix first
  X <- scale(X, center = TRUE, scale = FALSE)

  (1 / n_samples) * t(X) %*% X
}
```
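As a quick sanity check (my addition, not part of the assignment), the result should match R's built-in `cov`, which uses the unbiased $1/(n-1)$ scaling, once we rescale by $(n-1)/n$:

```{r}
# sanity check: the MLE estimate should equal the unbiased
# estimate from cov() rescaled by (n - 1) / n
X_check <- matrix(rnorm(50 * 3), nrow = 50)  # arbitrary toy matrix
n <- nrow(X_check)
all.equal(sample_covariance(X_check), cov(X_check) * (n - 1) / n)
```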
Using this function we can calculate the $\hat{\Sigma}$ matrix of the given data.
```{r}
set.seed(1)

# generate toy dataset
P = 4
N = 200
rho = 0.5
V <- rho^abs(outer(1:P, 1:P, "-"))
X = as.matrix(mvrnorm(N, mu = rep(0, P), Sigma = V))
beta = as.matrix(c(1, 1, 0.5, 0.5))
Y = X %*% beta + rnorm(N)

# sigma-hat calculation
(sigma <- sample_covariance(X))
```
To find the matrix square root we can use the spectral decomposition of $\Sigma$. Covariance matrices are symmetric and positive semi-definite, so they are diagonalizable with non-negative eigenvalues; assuming $\hat{\Sigma}$ is in fact positive definite (all eigenvalues strictly positive), the inverse below exists and the following argument is valid:
$$ \Sigma = Q \Lambda Q^{-1} \Rightarrow \Sigma^{\frac{1}{2}} = Q \sqrt{\Lambda} Q^{-1}$$ where $Q$ is the matrix of eigenvectors of $\Sigma$ and $\Lambda$ is a diagonal matrix with the eigenvalues on the diagonal. Once we have $\Sigma^{\frac{1}{2}}$ we can invert it to get $\Sigma^{-\frac{1}{2}}$. In R:
```{r}
#' Matrix Square Root
#'
#' Calculates the square root of a matrix by using its spectral
#' decomposition.
#'
#' @param X A matrix.
#' @param symmetric If \code{TRUE}, the matrix is assumed to be symmetric.
#'   Passed to \code{eigen}.
#' @return A matrix of the same shape as \code{X}.
matrix_sqrt <- function(X, symmetric = FALSE) {
  # Perform the spectral decomposition.
  # Covariance matrices are symmetric, so callers should pass
  # symmetric = TRUE to take advantage of this optimization.
  X_eig <- eigen(X, symmetric = symmetric)

  # extract the Q eigenvector matrix and the eigenvalues
  Q <- X_eig$vectors
  values <- X_eig$values
  Q_inv <- solve(Q)  # Q^{-1}

  Q %*% diag(sqrt(values)) %*% Q_inv
}

# inverse of the matrix square root of sigma (sigma^{-1/2})
solve(matrix_sqrt(sigma, symmetric = TRUE))
```
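Two quick checks of `matrix_sqrt` (my addition): squaring the result should recover $\hat{\Sigma}$, and conjugating $\hat{\Sigma}$ by $\hat{\Sigma}^{-\frac{1}{2}}$ should whiten it to the identity:

```{r}
# check 1: the square root squared should recover sigma
sigma_sqrt <- matrix_sqrt(sigma, symmetric = TRUE)
all.equal(sigma_sqrt %*% sigma_sqrt, sigma)

# check 2: sigma^{-1/2} %*% sigma %*% sigma^{-1/2} should be the identity
sigma_inv_sqrt <- solve(sigma_sqrt)
all.equal(sigma_inv_sqrt %*% sigma %*% sigma_inv_sqrt, diag(ncol(sigma)))
```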
b) We want to perform a 5-NN estimation on the target point $x = (0.5, 0.5, 0.5, 0.5)^{T}$. To do this, let's first write a function `mydist <- function(x1, x2)` that returns the Euclidean distance between any two vectors `x1` and `x2`. Calculate the distance from all sample points to the target point $x$ and output the row numbers of the closest 5 subjects. Use their $Y$ values to obtain the 5-NN estimation at the target point.
First we need to write a function that calculates the Euclidean distance between any two vectors:
```{r}
#' Euclidean Distance
#'
#' Calculates the Euclidean distance between two vectors
#' \code{x1} and \code{x2}.
#'
#' @param x1,x2 numeric vectors.
#' @return the Euclidean distance between the inputs.
mydist <- function(x1, x2) {
  # sanity check that x1 and x2 have the same number of features.
  if (length(x1) != length(x2))
    stop("`x1` and `x2` must be the same length.")

  sqrt(sum((x1 - x2)^2))
}
```
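A quick usage check (my addition): the classic 3-4-5 triangle, and agreement with base R's `dist`:

```{r}
mydist(c(0, 0), c(3, 4))  # should be 5

# agrees with base R's dist() on the first two rows of X
all.equal(mydist(X[1, ], X[2, ]), as.numeric(dist(X[1:2, ])))
```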
Then, to find the $k$ nearest neighbors of a vector $y$, we need a function that calculates the distance between $y$ and all points in a data matrix $X$ and returns the indices of the $k$ smallest distances.
```{r}
#' Nearest Neighbors
#'
#' Find the k nearest neighbors to a point \code{x_query}
#' among the rows of a dataset \code{X}.
#'
#' @param X Feature matrix of shape [n_samples, n_features].
#' @param x_query A single vector of shape [n_features].
#' @param k The number of nearest neighbors to compute.
#' @param dist_func A function that defines a valid distance.
#'   The default is Euclidean distance.
#' @param ... These parameters are passed into \code{dist_func}.
#' @return The indices of the k nearest neighbors.
nearest_neighbors <- function(X, x_query, k, dist_func = mydist, ...) {
  # if k >= the number of points, they are all nearest neighbors
  if (k >= nrow(X)) return(1:nrow(X))

  # bind the distance function to the query vector
  partial_dist <- function(x) { dist_func(x, x_query, ...) }

  # row-wise distances
  distances <- apply(X, 1, partial_dist)

  # kth smallest distance
  max_value <- sort(distances, partial = k)[k]

  # indices with distance less than or equal to the kth smallest
  # value are the nearest neighbors (ties broken by row order)
  which(distances <= max_value)[1:k]
}
```
We can use this function to find the nearest-neighbors of $x = (0.5, 0.5, 0.5, 0.5)^T$ in the dataset $X$:
```{r}
x <- c(0.5, 0.5, 0.5, 0.5)
nearest_neighbors(X, x_query = x, k = 5)
```
Finally, we can make predictions by averaging the $Y$ values of the k nearest neighbors.
```{r}
#' K-Nearest Neighbor Prediction for a Single Data Point
#'
#' Make a prediction for a data point \code{x_query}
#' based on its k nearest neighbors in the dataset
#' \code{X}. Predictions are made by averaging the
#' \code{Y} values of the k nearest neighbors.
#'
#' @param X dataset of shape [n_samples, n_features].
#' @param Y target vector of shape [n_samples].
#' @param x_query vector of shape [n_features] to make predictions on.
#' @param k number of nearest neighbors to use for predictions.
#' @param dist_func function used to calculate the distance between two points.
#' @param ... These arguments are passed into \code{dist_func}.
#' @return A single prediction for \code{x_query}.
predict_knn_single <- function(X, Y, x_query, k, dist_func = mydist, ...) {
  neighbors <- nearest_neighbors(X, x_query = x_query, k = k,
                                 dist_func = dist_func, ...)
  mean(Y[neighbors])
}
```
For the point $x = (0.5, 0.5, 0.5, 0.5)^T$ we have
```{r}
predict_knn_single(X, Y, x_query = x, k = 5)
```
c) Write another function `mydist2 <- function(x1, x2, s)` that returns the Mahalanobis distance between any two vectors `x1` and `x2` using any covariance matrix `s`. Redo the steps in b) to find the 5-NN estimation based on this Mahalanobis distance with `s` $= \hat{\Sigma}$.
The Mahalanobis distance is defined as
$$d(x, y) = \sqrt{(x - y)^T \Sigma^{-1} (x - y)}$$ where $\Sigma$ is the covariance matrix. In R:
```{r}
#' Mahalanobis Distance
#'
#' Calculates the Mahalanobis distance between two vectors
#' \code{x1} and \code{x2} using an arbitrary covariance
#' matrix \code{s}.
#'
#' @param x1,x2 numeric vectors.
#' @param s covariance matrix.
#' @return the Mahalanobis distance between the inputs.
mydist2 <- function(x1, x2, s) {
  if (length(x1) != length(x2))
    stop('`x1` and `x2` must have the same length.')
  if (nrow(s) != length(x1) || ncol(s) != length(x2))
    stop('`s` must be square with nrow = ncol = length(xi).')

  diff <- matrix(x1 - x2, nrow = 1)
  s_inv <- solve(s)
  as.numeric(sqrt(diff %*% s_inv %*% t(diff)))
}
```
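Two properties worth checking (my addition): with the identity covariance the Mahalanobis distance reduces to the Euclidean distance, and the result should agree with base R's `mahalanobis`, which returns the *squared* distance:

```{r}
# with the identity covariance, Mahalanobis reduces to Euclidean
all.equal(mydist2(X[1, ], X[2, ], diag(4)), mydist(X[1, ], X[2, ]))

# base R's mahalanobis() returns the squared distance
all.equal(mydist2(X[1, ], X[2, ], sigma),
          sqrt(mahalanobis(X[1, ], center = X[2, ], cov = sigma)))
```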
Repeating 1.b with the new distance function and the $\hat{\Sigma}$ matrix calculated in 1.a:
```{r}
predict_knn_single(X, Y, x_query = x, k = 5,
                   dist_func = mydist2, s = sigma)
```
d) Which estimator seems to perform better? Can you give an explanation?
To compare the two distance metrics we create a train-test split of the dataset:
```{r}
#' Create a train-test split of a dataset.
#'
#' For model evaluation it is useful to split a
#' dataset into a training and a testing set. This
#' helper function shuffles the dataset and then
#' splits off a `test_size` fraction of it for
#' testing.
#'
#' @param X matrix holding the covariates.
#' @param y a vector or matrix holding the target vector.
#' @param test_size A number between 0 and 1 indicating the fraction
#'   of the dataset to use for testing.
#' @param shuffle Boolean indicating whether to shuffle the dataset.
#' @param seed Random seed for the rng.
#' @return A list containing `X.train`, `X.test`, `y.train`, and
#'   `y.test`, holding the training and testing portions of
#'   `X` and `y`.
train_test_split <- function(X, y = NULL, test_size = 0.2,
                             shuffle = TRUE, seed = 123) {
  if (is.vector(y)) {
    y <- as.matrix(y)
  }
  if (!is.null(y) && nrow(X) != nrow(y)) {
    stop("`X` and `y` must have the same number of rows.")
  }
  if (!is.null(seed)) {
    set.seed(seed)
  }

  n_samples <- nrow(X)
  n_test <- floor(n_samples * test_size)
  n_train <- n_samples - n_test

  if (shuffle) {
    indices <- sample(n_samples)
  } else {
    indices <- 1:n_samples
  }

  # apply the (possibly shuffled) ordering before splitting
  train_idx <- indices[1:n_train]
  test_idx <- indices[(n_train + 1):n_samples]

  X_train <- X[train_idx, ]
  X_test <- X[test_idx, ]
  if (!is.null(y)) {
    y_train <- y[train_idx]
    y_test <- y[test_idx]
  } else {
    y_train <- NULL
    y_test <- NULL
  }

  list(X.train = X_train, X.test = X_test,
       y.train = y_train, y.test = y_test)
}

splits <- train_test_split(X, Y)
```
In addition, we use the following function to return predictions on a query matrix $X$ (as opposed to a single point).
```{r}
#' K-Nearest Neighbors Regression
#'
#' Make predictions for each point in a query dataset
#' \code{X_query} based on the k nearest neighbors in the dataset
#' \code{X}. Predictions are made by averaging the
#' \code{Y} values of the k nearest neighbors.
#'
#' @param X matrix of shape [n_samples, n_features].
#' @param Y target vector of shape [n_samples].
#' @param X_query matrix of shape [n_queries, n_features] to make predictions on.
#' @param k number of nearest neighbors to use for predictions.
#' @param dist_func function used to calculate the distance between two points.
#' @param ... These arguments are passed into \code{dist_func}.
#' @return A vector of predictions, one per row of \code{X_query}.
predict_knn <- function(X, Y, X_query, k, dist_func = mydist, ...) {
  # wrap `predict_knn_single` to use it in a row-wise apply.
  knn.predict <- function(x_query) {
    predict_knn_single(X = X, Y = Y, x_query = x_query, k = k,
                       dist_func = dist_func, ...)
  }
  apply(X_query, 1, knn.predict)
}
```
Finally, we need a function to determine the mean squared error over new data-points:
```{r}
#' Mean Squared Error
#'
#' @param y_true,y_pred numeric vectors of true and predicted values.
#' @return the mean squared error.
mean_squared_error <- function(y_true, y_pred) {
  y_true <- as.numeric(y_true)
  y_pred <- as.numeric(y_pred)
  mean((y_true - y_pred)^2)
}
```
We then fit a $k = 5$ $k$NN model on the training data and record the mean-squared error on the test-set. For the $k$NN model using Euclidean distance:
```{r}
# Euclidean distance
preds <- predict_knn(X = splits$X.train, Y = splits$y.train,
                     X_query = splits$X.test, k = 5)
(euclidean_error <- mean_squared_error(splits$y.test, preds))
```
We perform the same calculation using Mahalanobis distance. Note that we calculate $\hat \Sigma$ only on the training set.
```{r}
# Mahalanobis distance
sigma <- sample_covariance(splits$X.train)
preds <- predict_knn(X = splits$X.train, Y = splits$y.train,
                     X_query = splits$X.test, k = 5,
                     dist_func = mydist2, s = sigma)
(mbs_error <- mean_squared_error(splits$y.test, preds))
```
In the end we find that the model using Euclidean distance outperforms the model using Mahalanobis distance in terms of mean squared error on the test set. This could happen for a variety of reasons. The benefit of Mahalanobis distance is that it puts the covariates on the same scale. However, in this case every $X_i$ is already on roughly the same scale ($X_i \sim N(0, 1)$), so we gain little over the Euclidean metric. In addition, we did not perform a grid search over the number of nearest neighbors; it could be that Mahalanobis distance outperforms Euclidean distance for certain values of $k$. A sketch of such a search follows.
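A minimal sketch of that grid search, reusing the helpers defined above (my addition; `sigma` here is the training-set estimate from the previous chunk):

```{r}
# compare the test MSE of the two metrics over a grid of k values
ks <- c(1, 3, 5, 10, 15, 25)
errors <- sapply(ks, function(k) {
  pred_euc <- predict_knn(X = splits$X.train, Y = splits$y.train,
                          X_query = splits$X.test, k = k)
  pred_mah <- predict_knn(X = splits$X.train, Y = splits$y.train,
                          X_query = splits$X.test, k = k,
                          dist_func = mydist2, s = sigma)
  c(euclidean = mean_squared_error(splits$y.test, pred_euc),
    mahalanobis = mean_squared_error(splits$y.test, pred_mah))
})
colnames(errors) <- ks
errors
```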
a) If we are interested in using $k = 5$, derive the degrees of freedom of this model using the given formula.
We are interested in finding the degrees of freedom for a general $k$NN model. In order to do this, let's develop some notation for the $k$NN estimator. We have that
$$ \hat{f}(x) = \frac{1}{k}\sum_{x_i \in N_k(x)} y_i $$ where $N_k(x)$ stands for the neighborhood of $x$ defined by the $k$ closest points $x_i$ in the training dataset. If we define the following indicator function
$$
w(x_i, x_j) =
\begin{cases}
1, & \text{if } x_i \in N_k(x_j) \\
0, & \text{otherwise}
\end{cases}
$$
we can re-write the $k$NN estimator as
$$ \hat{f}(x_j) = \frac{1}{k}\sum_{i = 1}^N w(x_i, x_j) \, y_i \, . $$
Since the degrees of freedom of an estimator is defined as,
$$ df(\hat f) = \frac{1}{\sigma^2} \sum_{i = 1}^N \Cov{\hat f(x_i)}{y_i} $$ we can focus on evaluating the covariance inside the sum
$$
\begin{split}
\Cov{\hat f(x_j)}{y_j} &= \Cov{\frac{1}{k}\sum_{i = 1}^N w(x_i, x_j) \, y_i}{y_j} \\
&= \frac{1}{k} \sum_{i = 1}^N w(x_i, x_j) \, \Cov{y_i}{y_j} \\
&= \frac{1}{k} \sum_{i = 1}^N w(x_i, x_j) \, \sigma^2 \, \delta_{ij} \quad (\text{assuming the } y_i \text{ are uncorrelated}) \\
&= \frac{\sigma^2}{k} \, w(x_j, x_j) = \frac{\sigma^2}{k},
\end{split}
$$

where the last step uses the fact that $x_j$ is always one of its own $k$ nearest neighbors, so $w(x_j, x_j) = 1$.
Thus,
$$ df(\hat f) = \frac{1}{\sigma^2} \sum_{j = 1}^N \frac{\sigma^2}{k} = \frac{N}{k}.$$
For $k = 5$ we find that $df(\hat f) = \frac{N}{5}$ where $N$ is the number of training samples.
b) Perform the simulation study.
First we generate a design matrix $\mathbf{X}$ with $n = 200$ samples drawn from $p = 4$ independent standard normal distributions.
```{r}
generate_features <- function(n_samples = 200, n_features = 4, seed = NULL) {
  if (!is.null(seed)) {
    set.seed(seed)
  }
  mu <- rep(0, n_features)
  cov <- diag(n_features)
  mvrnorm(n = n_samples, mu = mu, Sigma = cov)
}

X <- generate_features()
```
Now we will define the true data model as
$$ Y = 20x_1 x_2 + 20 x_3^2 + 10 x_4 + \epsilon $$
where $\epsilon \overset{iid}{\sim} N(0, 1)$. In R:
```{r}
generate_data <- function(n_samples = 200, n_features = 4,
                          seed.X = NULL, seed.Y = NULL) {
  X <- generate_features(n_samples = n_samples,
                         n_features = n_features, seed = seed.X)
  if (!is.null(seed.Y)) {
    set.seed(seed.Y)
  } else {
    # re-seed from the clock in case a seed was set previously
    set.seed(as.integer(Sys.time()))
  }
  error <- rnorm(n_samples)
  Y <- 20 * X[, 1] * X[, 2] + 20 * X[, 3]^2 + 10 * X[, 4] + error
  list(X = X, Y = Y)
}

data <- generate_data(seed.X = 1, seed.Y = 1)
```
Out of interest I also visualized the data in a scatter plot matrix. The pairwise relationships are estimated with a LOESS smoother:
```{r}
ggscatmat(as.data.frame(data), alpha = 0.8) +
  geom_smooth(se = FALSE) +
  theme_bw()
```
We now fit a $k$NN model to this synthetic dataset. Note that I am using `knnreg` (and later on `knn3`) from the `caret` package. `caret` just wraps `kknn` and `knn` with some performance improvements and a more standard fit/predict interface. Fitting the model in R:
```{r}
# We use knnreg from the caret package. This is a wrapper around
# kknn with speed improvements and a better interface.
model <- knnreg(data$X, data$Y, k = 5)
preds <- predict(model, data$X)
head(preds, 10)
```
We want a good estimate of $\Cov{\hat y_i} {y_i}$, so we run 20 simulations (with $X$ held fixed and $Y$ re-drawn) and estimate each covariance empirically:
```{r}
n_iter <- 20
n_samples <- 200

# Each column of these matrices holds the n_iter simulated values
# of \hat y_i (or y_i) for a single sample point i; each row is
# one simulation.
y_pred <- matrix(nrow = n_iter, ncol = n_samples)
y_true <- matrix(nrow = n_iter, ncol = n_samples)

for (i in 1:n_iter) {
  # for reproducibility we still set the seed for `Y` to be the
  # iteration number.
  data <- generate_data(n_samples = n_samples, seed.X = 123, seed.Y = i)
  model <- knnreg(x = data$X, y = data$Y, k = 5)
  y_pred[i, ] <- predict(model, data$X)
  y_true[i, ] <- data$Y
}

# helper function to calculate the MLE sample covariance
# between two vectors.
sample_cov <- function(x, y) {
  n <- length(x)
  scale_factor <- (n - 1) / n
  scale_factor * cov(x, y)
}

# sum of the column-wise covariances
(df <- sum(
  mapply(sample_cov, as.data.frame(y_true), as.data.frame(y_pred))
))
```
We see that $d\hat f =$ `r df`, which is close to the theoretical value of $N / k =$ `r n_samples / 5`.
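To see that the $N/k$ relationship holds beyond $k = 5$, we can wrap the simulation in a function and repeat it for a few values of $k$ (my addition; with only 20 simulations per $k$ the estimates are somewhat noisy):

```{r}
# repeat the df simulation for several values of k and compare to N / k
estimate_df <- function(k, n_iter = 20, n_samples = 200) {
  y_pred <- matrix(nrow = n_iter, ncol = n_samples)
  y_true <- matrix(nrow = n_iter, ncol = n_samples)
  for (i in 1:n_iter) {
    data <- generate_data(n_samples = n_samples, seed.X = 123, seed.Y = i)
    model <- knnreg(x = data$X, y = data$Y, k = k)
    y_pred[i, ] <- predict(model, data$X)
    y_true[i, ] <- data$Y
  }
  sum(mapply(sample_cov, as.data.frame(y_true), as.data.frame(y_pred)))
}

ks <- c(2, 5, 10, 20)
data.frame(k = ks,
           estimated_df = sapply(ks, estimate_df),
           theoretical_df = 200 / ks)
```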
c) Consider the linear model $\mathbf{y = X \beta + \epsilon}$, and the fitted value from linear regression $\mathbf{\hat y = X \hat\beta = X(X^TX)^{-1}X^Ty}$. For simplicity, let's assume the $\epsilon_i$'s are i.i.d. normal with mean 0 and variance $\sigma^2$. Recall the alternative definition of the degrees of freedom:
$$df(\hat f) = \frac{1}{\sigma^2} \Trace{ \Cov{ \mathbf{\hat y} }{ \mathbf{y} } }.$$
We want to calculate the theoretical degrees of freedom of the linear regression model. We can apply the definition directly
$$
\begin{split}
df(\hat f) &= \frac{1}{\sigma^2} \Trace{ \Cov{X(X^T X)^{-1}X^Ty}{y} } \\
&= \frac{1}{\sigma^2} \Trace{ X(X^T X)^{-1}X^T \Cov{y}{y} } \\
&= \Trace{ X(X^T X)^{-1}X^T } \quad \text{since } \Cov{y}{y} = \sigma^2 I_n \\
&= \Trace{ X^T X (X^T X)^{-1} } \quad \text{(cyclic property of the trace)} \\
&= \Trace{I_{p \times p}} \\
&= p
\end{split}
$$
So the theoretical degrees of freedom of the linear regression model is $p$, the number of coefficients being estimated.
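We can check this numerically with the same simulation scheme as in part b), fitting OLS on the design matrix without an intercept so that the hat matrix is exactly $X(X^TX)^{-1}X^T$ with $p = 4$ (my addition; since $\sigma^2 = 1$ here, the covariance sum is the df estimate directly):

```{r}
# numerical check: the estimated df of OLS (no intercept) should be p = 4
n_iter <- 20
n_samples <- 200
y_pred <- matrix(nrow = n_iter, ncol = n_samples)
y_true <- matrix(nrow = n_iter, ncol = n_samples)

for (i in 1:n_iter) {
  data <- generate_data(n_samples = n_samples, seed.X = 123, seed.Y = i)
  fit <- lm(data$Y ~ data$X - 1)  # design matrix only, no intercept
  y_pred[i, ] <- fitted(fit)
  y_true[i, ] <- data$Y
}

sum(mapply(sample_cov, as.data.frame(y_true), as.data.frame(y_pred)))
```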
This question uses the `SAheart` dataset from the `ElemStatLearn` package. Consider the $k$NN model using two variables, `age` and `tobacco`, to model the binary outcome `chd` (coronary heart disease). Use 10-fold cross-validation to select the best $k$. Also report your training error and plot the average cross-validation error curve for different choices of $k$.

First we load the dataset and visualize the relationship `chd ~ age + tobacco` that we want to model. Note that I am using `dplyr` and `tidyr` for a lot of the data wrangling.
```{r}
# explicitly cast `chd` as a factor
saheart <- SAheart %>%
  mutate(chd = as.factor(chd))

ggplot(data = saheart, aes(x = age, y = tobacco, color = chd)) +
  geom_point() +
  scale_color_manual(values = c('darkorange', 'deepskyblue')) +
  theme_bw()
```
Now we use 10-fold cross-validation to find the best $k$NN model
```{r}
set.seed(123)

n_samples <- nrow(saheart)
n_folds <- 10
k_max <- 50

# assign fold indices to the samples
cv <- sample(rep(1:n_folds, length.out = n_samples))

# matrices to hold train/test error
test_loss <- matrix(nrow = k_max, ncol = n_folds)
colnames(test_loss) <- paste0('Fold', 1:n_folds)
train_loss <- matrix(nrow = k_max, ncol = n_folds)
colnames(train_loss) <- paste0('Fold', 1:n_folds)

# misclassification error metric.
misclassification_error <- function(y_true, y_pred) {
  mean(y_true != y_pred)
}

# exhaustively search over all values of k
for (k in 1:k_max) {
  # run each fold in parallel
  results <- foreach(fold = 1:n_folds, .combine = 'rbind') %dopar% {
    set.seed(42)

    # training/testing folds
    train <- cv != fold
    test <- cv == fold

    # fit a model to the training data
    model <- knn3(chd ~ tobacco + age, data = saheart, subset = train, k = k)

    # extract predictions / error on the training set
    train_preds <- predict(model, saheart[train, ], type = 'class')
    train_error <- misclassification_error(saheart[train, 'chd'], train_preds)

    # extract predictions / error on the test set
    test_preds <- predict(model, saheart[test, ], type = 'class')
    test_error <- misclassification_error(saheart[test, 'chd'], test_preds)

    c(train_error, test_error)
  }

  # aggregate the results
  train_loss[k, ] = results[, 1]
  test_loss[k, ] = results[, 2]
}

# the best k value has the minimum mean loss over the folds.
(best_k <- which.min(rowMeans(test_loss)))
```
The average training error for $k =$ `r best_k` is `r rowMeans(train_loss)[best_k]`, and the average testing error is `r rowMeans(test_loss)[best_k]`.
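As a cross-check (my addition), `caret::train` can run the same 10-fold search; it selects $k$ by maximizing accuracy, which is equivalent to minimizing the misclassification error, though its fold assignments differ from ours so the chosen $k$ may not match exactly:

```{r}
# caret's built-in 10-fold CV over the same grid of k values
# (default classification metric is accuracy = 1 - misclassification error)
set.seed(123)
cv_fit <- train(chd ~ tobacco + age, data = saheart,
                method = 'knn',
                trControl = trainControl(method = 'cv', number = 10),
                tuneGrid = data.frame(k = 1:50))
cv_fit$bestTune
```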
Now we visualize the training / testing error curves.
```{r}
# add a column indicating the value of k used
test_data <- as.data.frame(test_loss) %>% mutate(k = row_number())
train_data <- as.data.frame(train_loss) %>% mutate(k = row_number())

# This dataset has 4 columns (set, k, fold, and error):
#   - set: training or testing set
#   - k: value of k in the kNN fit
#   - fold: Fold1 - Fold10
#   - error: the error for this combination of set, k, and fold.
# Essentially a 'tidy' dataframe of the train/test error curves.
data <- bind_rows(list(train = train_data, test = test_data), .id = 'set') %>%
  gather(key = fold, value = error, Fold1:Fold10)

# these are the average error curves for train/test
grouped_data <- data %>%
  group_by(k, set) %>%
  summarize(average_error = mean(error))

ggplot(NULL, aes(x = k, color = set)) +
  # all training / testing error points
  geom_point(data = data, aes(y = error), alpha = 0.4) +
  # average training / testing error lines
  geom_line(data = grouped_data, aes(y = average_error), size = 1) +
  scale_color_manual(values = c(train = 'deepskyblue', test = 'darkorange')) +
  theme_bw() +
  theme(panel.grid.minor = element_blank())
```
Out of curiosity I also plotted the decision boundary of the model trained with the first CV fold held out.
```{r}
set.seed(42)

# visualize the decision boundary for the classifier trained
# on all but the first fold
train <- cv != 1
model <- knn3(chd ~ tobacco + age, data = saheart, subset = train, k = best_k)

# predictions made on an evenly spaced grid over
# the ranges of age and tobacco
var_grid <- expand.grid(
  age = seq(0, max(saheart$age) + 1, 0.5),
  tobacco = seq(0, max(saheart$tobacco) + 1, 0.5)
) %>%
  # predicted class labels
  mutate(y_pred = predict(model, ., type = 'class')) %>%
  # predicted probabilities (the decision boundary is proba = 0.5)
  mutate(y_proba = predict(model, .)[, 1])

# plot the actual labels and the decision boundary of our classifier
ggplot(NULL, aes(x = age, y = tobacco)) +
  # classification on a grid of points
  geom_point(data = var_grid, aes(color = y_pred), size = 0.3, alpha = 0.5) +
  # a scatter plot of the true values
  geom_point(data = saheart, aes(color = chd)) +
  # the proba = 0.5 contour (this is the decision boundary)
  geom_contour(data = var_grid, aes(z = y_proba), breaks = 0.5,
               color = 'black') +
  scale_color_manual(values = c('darkorange', 'deepskyblue')) +
  theme_bw() +
  theme(panel.grid.minor = element_blank())
```