2.1 Weight-space View

Standard Linear Model

Typical linear regression models with Gaussian noise follow the parameterization \begin{equation} f(\mathbf{x}) = \mathbf{x}^T\mathbf{w}, \qquad y = f(\mathbf{x}) + \epsilon \end{equation}

where $\mathbf{x}$ is the input vector and $\mathbf{w}$ is the vector of weights relating the input to the output $y$. The noise terms $\epsilon$ are assumed IID Gaussian with mean 0 and variance $\sigma_n^2$, i.e. $\epsilon \sim \mathcal{N}(0, \sigma_n^2)$.

The likelihood of such a model, with the training inputs collected as the columns of the design matrix $X$, is

\begin{equation} p(\mathbf{y}|X, \mathbf{w}) = \mathcal{N}(X^T\mathbf{w}, \sigma_n^2 I) \end{equation}

We place a (typically diffuse) zero-mean Gaussian prior on the weights \begin{equation} \mathbf{w} \sim \mathcal{N}(\mathbf{0}, \Sigma_p) \end{equation}

Combining the prior and likelihood, we arrive at the posterior

\begin{equation} \mathbf{w}|X, \mathbf{y} \sim \mathcal{N}\left(\bar{\mathbf{w}}, A^{-1}\right), \qquad \bar{\mathbf{w}} = \frac{1}{\sigma_n^2}A^{-1}X\mathbf{y} \end{equation}

where $A=\sigma_n^{-2}XX^T+\Sigma_p^{-1}$.

Example 1

# data
N <- 100
beta <- cbind(c(-4, 2))
sigma2.n <- 1
x <- seq(0, 4, len=N)
X <- rbind(rep(1, N), x)
epsilon <- rnorm(N, sd=sqrt(sigma2.n))
y <- t(X) %*% beta + epsilon

# prior
Sigma.p <- diag(100, nrow(beta))

# posterior 
A <- 1 / sigma2.n * X %*% t(X) + solve(Sigma.p)
w.bar <- 1 / sigma2.n * solve(A) %*% X %*% y
A.inv <- solve(A)

# 95% credible interval
se <- cbind(qnorm(0.975) * sqrt(diag(A.inv))) %*% c(-1, 1)
ci <- se + cbind(w.bar, w.bar)
rownames(ci) <- c("w[1]", "w[2]")
library(xtable)
options(xtable.comment = FALSE)
xtable(ci)
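
As a quick sanity check, with a diffuse prior such as $\Sigma_p = 100I$ the posterior mean $\bar{\mathbf{w}}$ should be close to the ordinary least-squares fit:

# added check: posterior mean under a diffuse prior is close to OLS
coef(lm(y ~ x))
t(w.bar)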

Projections of Inputs into Feature Space

To extend the Bayesian linear model, we can project the input $x$ into a feature space, for example the space of powers of $x$: $\phi(x) = (1, x, x^2, \ldots)^T$. We then define the function $f$ as

\begin{equation} f(\mathbf{x})=\phi(\mathbf{x})^T\mathbf{w} \end{equation}

Note that for a $D$-dimensional input vector $\mathbf{x}$, $\phi$ maps into an $N$-dimensional feature space, where typically $N \gg D$.

Notational considerations: $\Phi=\Phi(X)$ is the aggregation of the columns $\phi(\mathbf{x})$ for all training inputs, and now $A=\sigma_n^{-2} \Phi \Phi^T + \Sigma_p^{-1}$. Additionally, $\phi_*=\phi(\mathbf{x}_*)$ and $K=\Phi^T\Sigma_p\Phi$.

In this setting the weights $\mathbf{w}$ themselves are of less interest, since individual feature weights are hard to interpret. Instead, we are interested in the posterior predictive distribution:

\begin{equation} f_*|\mathbf{x}_*, X, \mathbf{y} \sim \mathcal{N}\left(\frac{1}{\sigma_n^2}\phi(\mathbf{x}_*)^TA^{-1}\Phi\mathbf{y},\ \phi(\mathbf{x}_*)^TA^{-1}\phi(\mathbf{x}_*)\right) \end{equation}

Note that this parameterization requires the inversion of $A$, an $N\times N$ matrix, where $N$ may be large. Alternatively, we can rewrite this formulation as \begin{equation} f_*|\mathbf{x}_*, X, \mathbf{y} \sim \mathcal{N}\left(\phi_*^T\Sigma_p\Phi(K+\sigma_n^2I)^{-1}\mathbf{y},\ \phi_*^T\Sigma_p\phi_* - \phi_*^T\Sigma_p\Phi(K+\sigma_n^2I)^{-1}\Phi^T\Sigma_p\phi_*\right) \end{equation}

which involves inversion of the smaller $n \times n$ matrix $K+\sigma_n^2I$, where $n$ is the number of training points.

Since the feature space always enters the predictive equations in the form $\Phi^T\Sigma_p\Phi$, $\phi_*^T\Sigma_p\Phi$, or $\phi_*^T\Sigma_p\phi_*$, we define the covariance function, or kernel, as $k(\mathbf{x}, \mathbf{x}')=\phi(\mathbf{x})^T\Sigma_p\phi(\mathbf{x}')$.
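
A minimal sketch of this equivalence (the polynomial features, simulated data, and prior below are illustrative assumptions, not taken from the text): the weight-space form and the kernel form give the same predictive mean and variance.

# illustrative (assumed) features phi(x) = (1, x, x^2)^T and simulated data
phi <- function(x) rbind(1, x, x ^ 2)

set.seed(1)
n <- 50
x <- seq(-2, 2, len=n)
sigma2.n <- 0.5 ^ 2
y <- 1 - 2 * x + 0.5 * x ^ 2 + rnorm(n, sd=sqrt(sigma2.n))

Sigma.p <- diag(10, 3)
Phi <- phi(x)           # N x n feature matrix (here N = 3)
x.star <- 1.5
phi.star <- phi(x.star)

# weight-space form: invert A (N x N)
A <- Phi %*% t(Phi) / sigma2.n + solve(Sigma.p)
mean.w <- t(phi.star) %*% solve(A, Phi %*% y) / sigma2.n
var.w <- t(phi.star) %*% solve(A, phi.star)

# kernel form: invert K + sigma_n^2 I (n x n), with K = Phi^T Sigma_p Phi
K.mat <- t(Phi) %*% Sigma.p %*% Phi
k.star <- t(Phi) %*% Sigma.p %*% phi.star
mean.k <- t(k.star) %*% solve(K.mat + sigma2.n * diag(n), y)
var.k <- t(phi.star) %*% Sigma.p %*% phi.star -
    t(k.star) %*% solve(K.mat + sigma2.n * diag(n), k.star)

c(mean.w, mean.k)  # identical predictive means
c(var.w, var.k)    # identical predictive variances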

2.2 Function-space View

# libraries
library(ggplot2)
library(mvtnorm)
library(fields)
library(tidyr)
library(dplyr)
theme_set(theme_classic())

# data
f.points <- data.frame(x=c(-4,-3,-1,0,2), y=c(-2,0,1,2,-1))

# K function
K <- function(X.p, X.q=NULL) {
    sigma <- exp(-0.5 * rdist(X.p, X.q) ^ 2)
    return(sigma)
}

# prior
set.seed(1)
N <- 500 # control fineness
x.star <- seq(-5, 5, len=N) # grid
f.star <- rmvnorm(3, mean=rep(0, N), sigma=K(x.star), method="svd")

# plot draws from prior
data <- data.frame(t(f.star))
data$x <- x.star
data <- gather(data, draw, f, X1:X3)

ggplot(data, aes(x=x, y=f)) +
    geom_line(aes(colour=draw, linetype=draw)) +
    geom_rect(aes(xmin=-Inf, xmax=Inf, 
                  ymin=qnorm(0.025), ymax=qnorm(0.975)),
              fill="lightgrey", alpha=0.1) +
    guides(colour="none", linetype="none")

# posterior
x <- f.points$x
f <- f.points$y
post.mean <- K(x.star, x) %*% solve(K(x, x)) %*% f
post.var <- K(x.star, x.star) - K(x.star, x) %*% solve(K(x, x)) %*% K(x, x.star)
post.draws <- rmvnorm(3, post.mean, post.var, method="svd")
post.data <- data.frame(t(post.draws))
post.data$x <- x.star
post.data <- gather(post.data, draw, f, X1:X3) %>%
    group_by(draw) %>%
    # pointwise 95% band centered on the posterior mean
    mutate(ll=as.vector(post.mean) - qnorm(0.975) * sqrt(diag(post.var)),
           ul=as.vector(post.mean) + qnorm(0.975) * sqrt(diag(post.var)))

ggplot(post.data, aes(x=x, y=f)) +
    geom_line(aes(colour=draw)) +
    geom_point(data=f.points, aes(x, y), shape="+", size=6) +
    geom_ribbon(aes(ymin=ll, ymax=ul), fill="lightgrey", alpha=0.5) +
    guides(colour="none")

Now we try the same procedure but assume noisy observations.

# libraries
library(ggplot2)
library(mvtnorm)
library(fields)
library(tidyr)
library(dplyr)
theme_set(theme_classic())

# posterior with noisy observations
# y = f(x) + eps
sigma2.n <- 0.2        # "known" variance of eps
# generate data
x.star <- seq(-5, 5, len=100)
x <- sample(x.star, 20)
y <- x * sin(x) + rnorm(length(x), 0, sqrt(sigma2.n))
f.points <- data.frame(x=x, y=y)

post.mean <- K(x.star, x) %*% (solve(K(x, x) + sigma2.n * diag(1, length(x)))) %*% y
post.var <- K(x.star, x.star) - K(x.star, x) %*% solve(K(x, x) + sigma2.n * diag(1, length(x))) %*% K(x, x.star)

# draws from f.star | X, y, X.star
post.draws <- rmvnorm(3, post.mean, post.var, method="svd")

# transform to data frame
post.data <- data.frame(t(post.draws))
post.data$x <- x.star
post.data <- gather(post.data, draw, f, X1:X3) %>%
    group_by(draw) %>%
    # pointwise 95% band centered on the posterior mean
    mutate(ll=as.vector(post.mean) - qnorm(0.975) * sqrt(diag(post.var)),
           ul=as.vector(post.mean) + qnorm(0.975) * sqrt(diag(post.var)))

ggplot(post.data, aes(x=x, y=f)) +
    geom_line(aes(colour=draw)) +
    geom_point(data=f.points, aes(x, y), shape="+", size=6) +
    geom_ribbon(aes(ymin=ll, ymax=ul), fill="lightgrey", alpha=0.5) +
    guides(colour="none")

Here we implement Algorithm 2.1 for a single test point.

N <- 1000 # number of training points
X <- cbind(seq(-5, 5, len=N), seq(10, 12, len=N)) # training input
f <- function(x) sum(x ^ 2) # true function
sigma2.n <- 0.2 # known variance
y <- apply(X, 1, f) + rnorm(N, 0, sqrt(sigma2.n)) # targets
x.star <- cbind(3, 11)   # one target point

# algorithm
k.star <- K(X, x.star)

L <- chol(K(X) + sigma2.n * diag(1, nrow(X))) # R's chol() returns the *upper* triangular factor
alpha <- backsolve(L, forwardsolve(t(L), y))  # two triangular solves (O(n^2) each) instead of a full O(n^3) inverse
post.mean <- t(k.star) %*% alpha
v <- forwardsolve(t(L), k.star)
post.var <- K(x.star, x.star) - t(v) %*% v
log.marg.lik <- -0.5 * t(y) %*% alpha - sum(log(diag(L))) - N / 2 * log(2 * pi)  # note the log() of the Cholesky diagonal; pi is R's built-in constant
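
Because $n$ is small enough here for a direct solve, we can verify the Cholesky-based results against the naive $O(n^3)$ computation:

# added check: direct solve should match post.mean and post.var above
Ky <- K(X) + sigma2.n * diag(1, nrow(X))
c(post.mean, t(k.star) %*% solve(Ky, y))
c(post.var, K(x.star, x.star) - t(k.star) %*% solve(Ky, k.star))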

2.3 Varying the Hyperparameters

An example of varying the hyperparameters of the squared-exponential covariance and selecting between a few candidate models based on the log marginal likelihood (computed in the sketch after the code below).

# squared exponential covariance with hyperparameters:
# k(x, x') = sigma2.f * exp(-|x - x'|^2 / (2 * ell^2)),
# with noise variance sigma2.n added on the diagonal of K(X, X) only
K <- function(X.p, X.q=X.p, ell=1, sigma2.f=1, sigma2.n=0.1) {
    k <- sigma2.f * exp(-0.5 * rdist(X.p / ell, X.q / ell) ^ 2)
    if (identical(X.p, X.q)) {
        k <- k + diag(sigma2.n, nrow(k))
    }
    return(k)
}

# generate data from GP with (l, sigma.f, sigma.n)=(1, 1, 0.1)
x <- sample(seq(-7.5, 7.5, len=1000), 20)
y <- as.vector(rmvnorm(1, mean=rep(0, length(x)), 
                       sigma=K(x, x, sigma2.n=0.1 ^ 2), method="svd"))
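
A minimal sketch of the selection step (the candidate length-scales below are arbitrary choices): compute the log marginal likelihood $\log p(\mathbf{y}|X)$ for a few values of $\ell$ and prefer the model with the largest value.

# log marginal likelihood for a given length-scale (sigma2.f, sigma2.n held fixed)
lml <- function(ell, sigma2.f=1, sigma2.n=0.1 ^ 2) {
    Ky <- K(x, x, ell=ell, sigma2.f=sigma2.f, sigma2.n=sigma2.n)
    L <- t(chol(Ky))                              # lower-triangular factor
    alpha <- backsolve(t(L), forwardsolve(L, y))  # Ky^{-1} y via triangular solves
    -0.5 * sum(y * alpha) - sum(log(diag(L))) - length(y) / 2 * log(2 * pi)
}

ells <- c(0.3, 1, 3)
setNames(sapply(ells, lml), ells)  # the generating value ell = 1 should typically be favored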

2.4 Decision Theory for Regression

2.5 An Example Application

# read in matrices
library(R.matlab)
sarcos.inv <- readMat("data/sarcos_inv.mat")$sarcos.inv
sarcos.inv.test <- readMat("data/sarcos_inv_test.mat")$sarcos.inv.test
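
A minimal subset-of-data sketch (assumptions: columns 1-21 of the SARCOS matrices are the inputs and column 22 is the first torque, as described in the book; the subset size, length-scale, and noise level are arbitrary):

# standardize inputs using the training subset's scaling
set.seed(1)
idx <- sample(nrow(sarcos.inv), 500)
X.tr <- scale(sarcos.inv[idx, 1:21])
y.tr <- sarcos.inv[idx, 22]
X.te <- scale(sarcos.inv.test[, 1:21],
              center=attr(X.tr, "scaled:center"),
              scale=attr(X.tr, "scaled:scale"))
y.te <- sarcos.inv.test[, 22]

# GP predictive mean with the SE covariance above, on centered targets
sigma2.n <- 0.1 * var(y.tr)
Ky <- K(X.tr, X.tr, ell=5, sigma2.f=var(y.tr), sigma2.n=sigma2.n)
pred <- K(X.te, X.tr, ell=5, sigma2.f=var(y.tr)) %*%
    solve(Ky, y.tr - mean(y.tr)) + mean(y.tr)

# standardized mean squared error (SMSE); values below 1 beat predicting the training mean
mean((y.te - pred) ^ 2) / var(y.te)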

2.6 Smoothing, Weight Functions and Equivalent Kernels

2.7 Incorporating Explicit Basis Functions

Exercises

  1. Replicate the generation of random functions from Figure 2.2. Use a regular (or random) grid of scalar inputs and the covariance function from eq. (2.16). Hints on how to generate random samples from multi-variate Gaussian distributions are given in section A.2. Invent some training data points, and make random draws from the resulting GP posterior using eq. (2.19).
# libraries
library(ggplot2)
library(mvtnorm)
library(fields)
library(tidyr)
theme_set(theme_classic())

# data
f.points <- data.frame(x=c(-4,-3,-1,0,2), y=c(-2,0,1,2,-1))
ggplot(f.points, aes(x, y)) +
    geom_point(shape="+", size=6)

# K function
K <- function(X.p, X.q=NULL) {
    sigma <- exp(-0.5 * rdist(X.p, X.q) ^ 2)
    return(sigma)
}

# prior
set.seed(1)
N <- 500 # control fineness
x.star <- seq(-5, 5, len=N) # grid
f.star <- rmvnorm(3, mean=rep(0, N), sigma=K(x.star), method="svd")

# plot draws from prior
data <- data.frame(t(f.star))
data$x <- x.star
data <- gather(data, draw, f, X1:X3)

ggplot(data, aes(x=x, y=f)) +
    geom_line(aes(colour=draw, linetype=draw)) +
    geom_rect(aes(xmin=-Inf, xmax=Inf, 
                  ymin=qnorm(0.025), ymax=qnorm(0.975)),
              fill="lightgrey", alpha=0.1)

# posterior
x.star <- seq(-5, 5, len=N) # grid
x <- f.points$x
f <- f.points$y
post.mean <- K(x.star, x) %*% solve(K(x, x)) %*% f
post.var <- K(x.star, x.star) - K(x.star, x) %*% solve(K(x, x)) %*% K(x, x.star)
post.draws <- rmvnorm(10, post.mean, post.var, method="svd")
post.data <- data.frame(t(post.draws))
post.data$x <- x.star
post.data <- gather(post.data, draw, f, X1:X10)

ggplot(post.data, aes(x=x, y=f)) +
    geom_line(aes(colour=draw)) +
    geom_point(data=f.points, aes(x, y), shape="+", size=6) +
    guides(colour="none")
  2. In eq. (2.11) we saw that the predictive variance at $\mathbf{x}_*$ under the feature space regression model was $\text{var}(f(\mathbf{x}_*)) = \phi(\mathbf{x}_*)^TA^{-1}\phi(\mathbf{x}_*)$. Show that $\text{cov}(f(\mathbf{x}_*),f(\mathbf{x}_*'))=\phi(\mathbf{x}_*)^TA^{-1}\phi(\mathbf{x}_*')$. Check that this is compatible with the expression given in eq. (2.24).
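
A short sketch of the argument, using the weight-space posterior derived in section 2.1 with $\Phi$ in place of $X$: since $f(\mathbf{x}_*) = \phi(\mathbf{x}_*)^T\mathbf{w}$ and $\mathbf{w}|X, \mathbf{y} \sim \mathcal{N}(\bar{\mathbf{w}}, A^{-1})$, bilinearity of covariance gives \begin{equation} \text{cov}(f(\mathbf{x}_*), f(\mathbf{x}_*')) = \phi(\mathbf{x}_*)^T \text{cov}(\mathbf{w}|X,\mathbf{y})\, \phi(\mathbf{x}_*') = \phi(\mathbf{x}_*)^T A^{-1} \phi(\mathbf{x}_*') \end{equation} which reduces to the stated variance when $\mathbf{x}_*' = \mathbf{x}_*$.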

  3. The Wiener process is defined for $x\geq0$ and has $f(0) = 0$. (See section B.2.1 for further details.) It has mean zero and a non-stationary covariance function $k(x, x') = \min(x, x')$. If we condition on the Wiener process passing through $f(1) = 0$ we obtain a process known as the Brownian bridge (or tied-down Wiener process). Show that this process has covariance $k(x,x') = \min(x,x')-xx'$ for $0 \leq x,x' \leq 1$ and mean 0. Write a computer program to draw samples from this process at a finite grid of $x$ points in [0, 1].
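
A minimal sketch for the sampling part of this exercise (the grid size and number of draws are arbitrary), using the Brownian bridge covariance $k(x, x') = \min(x, x') - xx'$ directly:

# draw sample paths of the Brownian bridge on a grid in [0, 1]
library(mvtnorm)

x <- seq(0, 1, len=200)
K.bb <- outer(x, x, pmin) - outer(x, x)         # min(x, x') - x * x'
bridge <- rmvnorm(3, sigma=K.bb, method="svd")  # mean-zero draws
matplot(x, t(bridge), type="l", lty=1, ylab="f(x)")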

  4. Let $\text{var}_n(f(\mathbf{x}_*))$ be the predictive variance of a Gaussian process regression model at $\mathbf{x}_*$ given a dataset of size $n$. The corresponding predictive variance using a dataset of only the first $n-1$ training points is denoted $\text{var}_{n-1}(f(\mathbf{x}_*))$. Show that $\text{var}_n(f(\mathbf{x}_*)) \leq \text{var}_{n-1}(f(\mathbf{x}_*))$, i.e. that the predictive variance at $\mathbf{x}_*$ cannot increase as more training data is obtained. One way to approach this problem is to use the partitioned matrix equations given in section A.3 to decompose $\text{var}_n(f(\mathbf{x}_*)) = k(\mathbf{x}_*, \mathbf{x}_*) - \mathbf{k}_*^T (K+\sigma_n^2I)^{-1}\mathbf{k}_*$.
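
Not a proof, but a quick numerical illustration of the claim using the squared-exponential covariance defined in the exercise above (the training points, test point, and noise level are chosen arbitrarily):

# predictive variance at x.star with and without the last training point
sigma2.n <- 0.1
x.star <- 1
pred.var <- function(x.tr) {
    k.star <- K(x.star, x.tr)
    K(x.star, x.star) - k.star %*%
        solve(K(x.tr, x.tr) + sigma2.n * diag(length(x.tr)), t(k.star))
}
x.tr <- c(-4, -3, -1, 0, 2)
c(pred.var(x.tr[-5]), pred.var(x.tr))  # the variance with n points is no larger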


