In nategarton13/sparseRGPs: Fit Sparse Gaussian Processes

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

First we will simulate and visualize some data from a 2D regression.

## uncomment the following line if you used devtools::install_github("nategarton13/sparseRGPs")
library(sparseRGPs)

## comment out the following line if you used devtools::install_github("nategarton13/sparseRGPs")
# devtools::load_all()
library(ggplot2)

## locations
set.seed(1116)
xy <- matrix(data = runif(n = 400, min = 0, max = 10), ncol = 2)

## make covariance matrix
cov_par_gp <- list("sigma" = 3, "l" = 3, "tau" = 0)
Sigma <- make_cov_matC(x = xy, x_pred = matrix(),
                       cov_par = cov_par_gp,
                       cov_fun = "sqexp", delta = 1e-6)

## sample the true latent GP
mu <- rep(0, times = nrow(xy))
ff <- as.numeric(mvtnorm::rmvnorm(n = 1, mean = mu, sigma = Sigma))

## sample the data
tau <- 1
y <- ff + rnorm(n = length(ff), mean = 0, sd = tau)

## plot the real observations
ggplot() +
  geom_point(mapping = aes(x = xy[,1], y = xy[,2], colour = y))

Let's fit a sparse GP using one-at-a-time knot selection using the log-likelihood as the objective function.

###############################################
## fit the sparse GP model using OAT
###############################################

## initialize covariance parameters
cov_par_start <- list("sigma" = 3, "l" = 3, "tau" = 1)

## initial knots
xu_init <- matrix(data = runif(n = 10, min = 0, max = 10), ncol = 2)

## maximum number of knots
maxknot <- 20

## number of candidate knot proposals explored
TTmax <- maxknot

## minimum number of candidate knot proposals to explore
TTmin <- 5

## maximum number of gradient ascent iterations
maxit <- 500

## objective function tolerance
obj_tol <- 1e-4

## initial step size parameter in ADADELTA
epsilon <- 1e-4

## Fit the sparse GP
## Should take around a minute
system.time(gp_fit_fic <- optimize_gp(y = y, # response/target values
                                  xy = xy, # matrix of observed input locations
                      cov_fun = "sqexp", # covariance function
                      cov_par_start = cov_par_start, # initial covariance parameter values
                      mu = mu, # marginal means of the GP at xy
                      family = "gaussian", # conditional distribution of Y|f
                      nugget = TRUE, # do you want to estimate the noise variance/nugget?
                      sparse = TRUE, # do you want to estimate a sparse model?
                      xu_opt = "oat", # method of knot selection
                      xu = xu_init, # initial knots
                      muu = rep(0, times = nrow(xu_init)), # marginal GP means at the knots
                      vi = FALSE, # use variational inference instead of maximum likelihood?
                      opt = list(maxknot = maxknot, # set algorithmic parameters
                                             TTmax = TTmax, 
                                             TTmin = TTmin,
                                             maxit = 10,
                                             obj_tol = obj_tol,
                                             epsilon = epsilon),
                      verbose = FALSE))

## look at estimated covariance parameters
gp_fit_fic$results$cov_par

## selected knots 
gp_fit_fic$results$xu


## plot the optimized objective function values for each added knot
plot(1:length(gp_fit_fic$results$obj_fun), gp_fit_fic$results$obj_fun)

## get predictions 
x_pred <- as.matrix(expand.grid(seq(from = 0, to = 10, by = 0.25), 
                                seq(from = 0, to = 10, by = 0.25)))
preds_fic <- predict_gp(mod = gp_fit_fic, 
                   x_pred = x_pred, 
                   mu_pred = rep(0, times = nrow(x_pred)), 
                   full_cov = FALSE, 
                   vi = FALSE)

## plot the real observations
ggplot() +
  geom_point(mapping = aes(x = xy[,1], y = xy[,2], colour = y))

## plot predictions over a fine grid
ggplot() +
  geom_tile(mapping = aes(x = x_pred[,1], y = x_pred[,2], 
                          fill = preds_fic$pred$pred_mean))


## plot the knots
ggplot() +
  geom_point(mapping = aes(x = gp_fit_fic$results$xu[,1], y = gp_fit_fic$results$xu[,2])) +
  geom_point(mapping = aes(x = gp_fit_fic$xu_init[,1], y = gp_fit_fic$xu_init[,2]), 
             colour = "blue", shape = 2, size = 2)

Now we fit a sparse GP using the one-at-a-time knot selection algorithm but using variational inference and the evidence lower bound (ELBO) as the objective function.

## fit a sparse GP using a variational approximation
##  This should take 5-6 minutes
system.time(gp_fit_vi <- optimize_gp(y = y, 
                                     xy = xy,
                           cov_fun = "sqexp",
                           cov_par_start = cov_par_start,
                           mu = mu,
                           family = "gaussian",
                           nugget = TRUE,
                           sparse = TRUE,
                           xu_opt = "oat",
                           xu = xu_init,
                           muu = rep(0, times = nrow(xu_init)),
                           vi = TRUE, 
                           opt = list(maxknot = maxknot,
                                             TTmax = TTmax,
                                             TTmin = TTmin,
                                             maxit = maxit,
                                             obj_tol = obj_tol,
                                             epsilon = epsilon),
                           verbose = FALSE))

gp_fit_vi$results$cov_par

## get predictions from a full GP
preds_vi <- predict_gp(mod = gp_fit_vi, 
                        x_pred = x_pred, 
                        mu_pred = rep(0, times = nrow(x_pred)), 
                        full_cov = FALSE, 
                        vi = TRUE)

## plot the real observations
ggplot() +
  geom_point(mapping = aes(x = xy[,1], y = xy[,2], colour = y))

## plot the predictions
#sparse vi model
vi_plot <- ggplot() +
  geom_tile(mapping = aes(x = x_pred[,1], y = x_pred[,2], fill = preds_vi$pred$pred_mean))

Now we estimate the full GP and compare the predictions made from each model in a scatterplot matrix.

## fit a full GP
##  This should take 1-2 minutes
system.time(gp_fit_full <- optimize_gp(y = y, 
                                       xy = xy,
                           cov_fun = "sqexp",
                           cov_par_start = cov_par_start,
                           mu = mu,
                           family = "gaussian",
                           nugget = TRUE,
                           sparse = FALSE,
                           vi = FALSE, 
                           opt = list(
                             maxit = maxit,
                             obj_tol = obj_tol,
                             epsilon = epsilon),
                           verbose = FALSE))

gp_fit_full$results$cov_par

## get predictions from a full GP
preds_full <- predict_gp(mod = gp_fit_full, 
                        x_pred = x_pred, 
                        mu_pred = rep(0, times = nrow(x_pred)), 
                        full_cov = FALSE, 
                        vi = FALSE)

## plot the real observations
ggplot() +
  geom_point(mapping = aes(x = xy[,1], y = xy[,2], colour = y))

## plot the predictions
#full
full_plot <- ggplot() +
  geom_tile(mapping = aes(x = x_pred[,1], y = x_pred[,2], fill = preds_full$pred$pred_mean))

## plot predictions against each other to see how they match
pairs(cbind(preds_full$pred$pred_mean, 
            preds_fic$pred$pred_mean,
            preds_vi$pred$pred_mean))