gaussianProcess: Gaussian Process Fitting and Prediction

Documented in create.gaussian.process fit.hyperparams get.alpha get.cov.mat.chol get.marginal.likelihood get.marginal.likelihood.grad get.marginal.likelihood.hessian plot.GaussianProcess predict.GaussianProcess sample.functions.from.kernel

GaussianProcess_class_name <- "GaussianProcess"
sigma.n.name <- "sigma.n"

# By default generate up to 10^6, so that if we square the number we can
# still add on a small sigma_n and not have it round off
rplaw <- function(n, alpha, x_0=1, x_1=10^6) {
  alpha1 <- alpha + 1
  ((x_1^alpha1 - x_0^alpha1) * runif(n) + x_0^alpha1)^(1/alpha1)
}


#' Create a gaussianProcess object
#'
#' This function creates a gaussianProcess object along with associated functions for fitting hyperparameters.
#'
#' Intend to rework this to fit the R S3 framework - it currently encloses all its methods to allow caching in
#' optimx calls, but with a bit more experience I think this isn't actually necessary.
#'
#' @param x A matrix or data frame of predictor variables
#' @param y A numeric vector of response variables
#' @param kernel a Kernel object specifying the kernel for the Gaussian process
#' @param cache a Cache object. If NULL, a cache is created
#'
#' @return An untrained gaussianProcess object
#' @export
#'
#' @importFrom optimx optimx
#' @importFrom cacheMan create_cache hash_object
#'
#' @examples
#' x <- rnorm(50)
#' y <- sin(1/(x^2 + 0.15))
#' mt <- create.model.tree.builtin()
#' mt <- insert.kernel.instance(mt, 1, "squaredExponential", NULL, hyper.params=c(l=NULL))
#' k <- create.kernel.object.from.model.tree(mt)
#' gp <- create.gaussian.process(x, y, k)
#' gp <- fit.hyperparams(gp)
#'
create.gaussian.process <- function(x, y, kernel, cache=NULL) {
  gp.obj <- list()
  class(gp.obj) <- GaussianProcess_class_name

  num.training.points <- nrow(x)
  if (is.null(num.training.points)) {
    num.training.points <- length(x)
    x <- matrix(x, nrow=num.training.points)
  }
  if (!is.numeric(num.training.points)) {
    stop("x does not appear to be a matrix or a vector")
  }

  if (class(kernel) != Kernel_class_name) {
    stop("invalid kernel specified")
  }

  gp.obj$training.points <- x
  gp.obj$num.training.points <- num.training.points
  gp.obj$training.point.values <- y
  gp.obj$kernel <- kernel

  if (is.null(cache)) {
    gp.obj$cache <- create_cache()
  } else {
    gp.obj$cache <- cache
  }

  attr(gp.obj$training.points, cacheMan:::hash_attr) <-
    hash_object(gp.obj$training.points, gp.obj$cache)

  attr(gp.obj$training.point.values, cacheMan:::hash_attr) <-
    hash_object(gp.obj$training.point.values, gp.obj$cache)

  gp.obj$kernel <- set.kernel.hash(gp.obj$kernel, cache)

  return(gp.obj)
}

check.hyperparams <- function(gp, hyper.params) {
  if (class(gp) != GaussianProcess_class_name) {
    stop("Invalid Gaussian process supplied.")
  }
  k.hp.names <- gp$kernel$hyperparam_names
  if (length(hyper.params) != length(k.hp.names)) {
    print("hyper.params")
    print(hyper.params)
    print("k.hp.names")
    print(k.hp.names)
    stop(paste("Number of hyperparameters supplied does not match kernel of GP:", length(hyper.params), "vs", length(k.hp.names)))
  }
  if (!is.null(names(hyper.params))) {
    if (any(!names(hyper.params) %in% k.hp.names)) {
      stop("Named vector of hyperparameters supplied does not match hyperparameter names of kernel")
    }
    hyper.params <- hyper.params[k.hp.names]
  } else {
    warning("Unnamed vector of hyperparams supplied.")
    names(hyper.params) <- k.hp.names
  }
  return(hyper.params)
}

#' Get the Cholesky decomposition of a covariance matrix
#'
#' @return the Cholesky decomposition of a covariance matrix
#'
#' @importFrom cacheMan cached_call
get.cov.mat.chol <- function(kernel,
                             x,
                             sigma.n,
                             hyper.params,
                             cache
) {

  cov.mat <- cached_call(get_covariance_matrix,
                         kernel=kernel,
                         x=x,
                         sigma.n=sigma.n,
                         hyper.params=hyper.params,
                         cache=cache)

  cov.mat.chol <- cached_call(chol,
                              x=cov.mat,
                              cache=cache)

  return(cov.mat.chol)
}

#' Get alpha for a GP
#'
#' Internal shenanigans. This documentation only exists so I can tell Roxygen
#' what to import for it.
#'
#' @param gp a GaussianProcess object
#'
#' @return a vector
#'
#' @importFrom cacheMan cached_call
get.alpha <- function(kernel,
                      training.points,
                      training.point.values,
                      sigma.n,
                      hyper.params,
                      cache) {

  cov.mat.chol <- cached_call(get.cov.mat.chol,
                              kernel=kernel,
                              x=training.points,
                              sigma.n=sigma.n,
                              hyper.params=hyper.params,
                              cache=cache
                              )

  cov.mat.inv <- cached_call(chol2inv,
                             cov.mat.chol,
                             cache=cache)

  y <- training.point.values

  alpha <- cov.mat.inv %*% y

  return(alpha)
}

#' Get Log Marginal Likelihood of a GP
#'
#' @param gp a GaussianProcess object
#' @param sigma.n a numeric value for sigma.n
#' @param hyper.params a numeric vector of hyperparameters
#'
#' @return the log marginal likelihood of the GP at the given hyperparameters
#'
#' @importFrom cacheMan cached_call
get.marginal.likelihood <- function(gp, sigma.n, hyper.params) {
  hyper.params <- check.hyperparams(gp, hyper.params)

  cov.mat.chol <- cached_call(get.cov.mat.chol,
                              kernel=gp$kernel,
                              x=gp$training.points,
                              sigma.n=sigma.n,
                              hyper.params=hyper.params,
                              cache=gp$cache)

  cov.mat.det.log <- 2 * sum(log(diag(cov.mat.chol)))

  R <- cov.mat.chol

  y <- gp$training.point.values

  z <- cached_call(forwardsolve,
                   t(R), y,
                   cache=gp$cache)

  x <- cached_call(backsolve,
                   R, z,
                   cache=gp$cache)

  log.marginal.likelihood <- as.numeric( - 1/2 * t(y) %*% x
                                         - 1/2 * cov.mat.det.log
                                         - nrow(gp$training.points)
                                           * log(2 * pi) / 2)

  return(log.marginal.likelihood)
}

#' Get the Grad of the Log Marginal Likelihood of a GP
#'
#' Returns the grad of the log marginal likelihood of a GP w.r.t. the kernel's
#' hyperparameters.
#'
#' @param gp a GaussianProcess object
#' @param sigma.n a numeric value for sigma.n
#' @param hyper.params a numeric vector of hyperparameters
#'
#' @return
#'
#' @importFrom cacheMan cached_call
#'
#' @examples
get.marginal.likelihood.grad <- function(gp, sigma.n, hyper.params) {
  hyper.params <- check.hyperparams(gp, hyper.params)

  cov.mat.grad.array <- cached_call(get_covariance_matrix_grad,
                                    kernel=gp$kernel,
                                    x=gp$training.points,
                                    sigma.n=sigma.n,
                                    hyper.params=hyper.params,
                                    cache=gp$cache)

  log.marginal.likelihood.grad <- numeric(length(hyper.params) + 1)
  names(log.marginal.likelihood.grad) <- c(sigma.n.name, names(hyper.params))

  cov.mat.chol <- cached_call(get.cov.mat.chol,
                              kernel=gp$kernel,
                              x=gp$training.points,
                              sigma.n=sigma.n,
                              hyper.params=hyper.params,
                              cache=gp$cache)

  cov.mat.inv <- cached_call(chol2inv,
                             cov.mat.chol,
                             cache=gp$cache)

  alpha <- cached_call(get.alpha,
                       kernel=gp$kernel,
                       training.points=gp$training.points,
                       training.point.values=gp$training.point.values,
                       sigma.n=sigma.n,
                       hyper.params=hyper.params,
                       cache=gp$cache)

  # The * rather than %*% in what follows is not a mistake - tr(t(A)%*%B) is
  # the sum of the element-wise product of A and B.
  # Maybe replace the sum() with kahan's summation or something equivalent.
  # Can't find details of R's implementation of sum().
  temp.multiplicand <- t((alpha %*% t(alpha) - cov.mat.inv))

  temp.multiplicand <- array(rep(temp.multiplicand, length(hyper.params) + 1),
                             dim(cov.mat.grad.array))

  log.marginal.likelihood.grad <- colSums(1/2 * temp.multiplicand * cov.mat.grad.array, dims=2)

  return(log.marginal.likelihood.grad)
}


#' Get the Hessian of the Log Marginal Likelihood of a GP
#'
#' Returns the Hessian matrix of the log marginal likelihood of a GP w.r.t. the kernel's
#' hyperparameters.
#'
#' @param gp a GaussianProcess object
#' @param sigma.n a numeric value for sigma.n
#' @param hyper.params a numeric vector of hyperparameters
#'
#' @return
#'
#' @importFrom cacheMan cached_call
#'
#' @examples
get.marginal.likelihood.hessian <- function(gp, sigma.n, hyper.params) {
  hyper.params <- check.hyperparams(gp, hyper.params)

  num.hps <- length(hyper.params) + 1

  cov.mat.grad.array <- cached_call(get_covariance_matrix_grad,
                                    kernel=gp$kernel,
                                    x=gp$training.points,
                                    sigma.n=sigma.n,
                                    hyper.params=hyper.params,
                                    cache=gp$cache)

  cov.mat.hess.array <- cached_call(get_covariance_matrix_hess,
                                    kernel=gp$kernel,
                                    x=gp$training.points,
                                    sigma.n=sigma.n,
                                    hyper.params=hyper.params,
                                    cache=gp$cache)

  log.marginal.likelihood.hess <- matrix(0,
                                         nrow=num.hps,
                                         ncol=num.hps)

  colnames(log.marginal.likelihood.hess) <- c(sigma.n.name, names(hyper.params))
  rownames(log.marginal.likelihood.hess) <- c(sigma.n.name, names(hyper.params))

  cov.mat.chol <- cached_call(get.cov.mat.chol,
                              kernel=gp$kernel,
                              x=gp$training.points,
                              sigma.n=sigma.n,
                              hyper.params=hyper.params,
                              cache=gp$cache)

  cov.mat.inv <- cached_call(chol2inv,
                             cov.mat.chol,
                             cache=gp$cache)

  alpha <- cached_call(get.alpha,
                       kernel=gp$kernel,
                       training.points=gp$training.points,
                       training.point.values=gp$training.point.values,
                       sigma.n=sigma.n,
                       hyper.params=hyper.params,
                       cache=gp$cache)


  grad_k_alpha <- matrix(0, nrow=length(alpha), ncol=num.hps)
  grad.k.k.inv <- cov.mat.grad.array
  for (i in 1:num.hps) {
    grad_k_alpha[, i] <- cov.mat.grad.array[, , i] %*% alpha
    grad.k.k.inv[, , i] <- cov.mat.grad.array[, , i] %*% cov.mat.inv
  }

  U <- alpha %*% t(alpha) - cov.mat.inv

  for (i in 1:num.hps) {
    for (j in i:num.hps) {
      L <- sum(cov.mat.inv * (grad_k_alpha[, i] %*% t(grad_k_alpha[, j])))
      M <- sum(t(grad.k.k.inv[, , i]) * grad.k.k.inv[, , j])
      N <- sum(U * cov.mat.hess.array[, , i, j])
      log.marginal.likelihood.hess[i, j] <-
        log.marginal.likelihood.hess[j, i] <- 1/2 * (N + M) - L
    }
  }

  #V <- cov.mat.hess.array
  #
  #for (i in 1:num.hps) {
  #  for (j in 1:num.hps) {
  #    V[, , i, j] <- grad_k_alpha[, i] %*% t(grad_k_alpha[, j])
  #  }
  #}
  # The * rather than %*% in what follows is not a mistake - tr(t(A)%*%B) is
  # the sum of the element-wise product of A and B.
  #temp.multiplicand <- t((alpha %*% t(alpha) - cov.mat.inv))

  #temp.multiplicand <- array(rep(temp.multiplicand, length(hyper.params) + 1),
  #                           dim(cov.mat.grad.array))

  #log.marginal.likelihood.grad <- colSums(1/2 * temp.multiplicand * cov.mat.grad.array, dims=2)

  return(log.marginal.likelihood.hess)
}

get.marginal.likelihood.optimx <- function(par, gp, prior=get.uniform.prior()) {
  -get.marginal.likelihood(gp=gp, sigma.n=par[1], hyper.params=par[-1]) - prior$log.prior(par)
}

get.marginal.likelihood.grad.optimx <- function(par, gp, prior=get.uniform.prior()) {
  -get.marginal.likelihood.grad(gp=gp, sigma.n=par[1], hyper.params=par[-1]) - prior$log.prior.grad(par)
}

optimize.gp.params <- function(gp,
                               init.params,
                               optimx.method,
                               max.iterations,
                               lower,
                               optimx.starttests,
                               optimx.trace,
                               prior=get.uniform.prior()) {
  return(optimx(init.params, get.marginal.likelihood.optimx,
                gr=get.marginal.likelihood.grad.optimx,
                method=optimx.method,
                itnmax=max.iterations,
                lower=lower,
                control=list(starttests=optimx.starttests,
                             trace=optimx.trace,
                             kkt=FALSE),
                gp=gp,
                prior=prior))
}

#' Fit hyperparameters for a GP using maximum likelihood optimisation
#'
#' @param gp a GaussianProcess object
#' @param sigma.n.init an initial value for sigma.n, or NA for random
#' initialisation
#' @param hyper.params.init a vector of initial values for the kernel
#' hyperparameters, or NA for random initialisation. Partial random starts
#' can be specified by setting only some entries in the vector to be NA, or
#' a complete random start can be specified by passing a single NA value.
#' @param random.init.gridsize the number of random initialisation points to
#' test
#' @param resample.top.inits boolean indicating whether to do a second random
#' search based on the (smoothed) distribution of the top results in the first
#' pass
#' @param resample.from number of points to build the distribution from in the
#' second random search
#' @param num.resamples number of samples for the second search
#' @param additional.optimx.runs the number of additional start points to attempt
#' optimisation from
#' @param additional.par.perc.diff the percentage difference required between
#' any two starting parameter vectors
#' @param random.init.scale a scaling factor for the random generation of
#' initialisation points
#' @param abs.min.sigma.n minimum value of sigma.n
#' @param abs.min.hyper.params a vector of minimum values of the hyper params
#' @param optimx.method the optimx method to use for optimisation
#' @param max.iterations max iterations of each optimx run
#' @param optimx.starttests run optimx start tests
#' @param optimx.trace optimx trace level
#' @param verbose verbose output to screen
#'
#' @return a trained GaussianProcess object
#' @export
#'
#' @importFrom cacheMan cached_call
#'
#' @examples
fit.hyperparams <- function(gp,
                            sigma.n.init=NA,
                            hyper.params.init=NA,
                            prior=get.uniform.prior(),
                            random.init.gridsize=50,
                            resample.top.inits=TRUE,
                            resample.from=NULL,
                            num.resamples=NULL,
                            additional.optimx.runs=1,
                            additional.par.perc.diff=0.1,
                            random.init.scale=1,
                            abs.min.sigma.n=0.001,
                            abs.min.hyper.params=0,
                            optimx.method="L-BFGS-B", max.iterations=10000,
                            optimx.starttests=FALSE, optimx.trace=0,
                            verbose=FALSE) {
  if (length(hyper.params.init) == 1 & is.na(hyper.params.init)){
    hyper.params.init <- rep(NA, length(gp$kernel$hyperparam_names))
    names(hyper.params.init) <- gp$kernel$hyperparam_names
  }

  hyper.params.init <- check.hyperparams(gp, hyper.params.init)

  # generate random.init.gridsize random pars and find the best
  best.par <- NULL
  best.log.likelihood <- -Inf

  par.mat <- matrix(0, nrow=random.init.gridsize, ncol=length(hyper.params.init) + 1)
  colnames(par.mat) <- c("sigma.n", names(hyper.params.init))
  log.likelihood.vec <- rep(-Inf, random.init.gridsize)

  successful_init_params <- 0

  for (i in 1:random.init.gridsize) {
    if (is.na(sigma.n.init)) {
      sigma.n.temp <- abs(rcauchy(n=1, scale=random.init.scale)) + abs.min.sigma.n
    } else {
      sigma.n.temp <- sigma.n.init
    }
    if (length(hyper.params.init) > 0) {
      par <- c(sigma.n.temp, sapply(hyper.params.init, function(x) {
        if (is.na(x)) {
          randnum <- rcauchy(n=1, scale=random.init.scale)
          return(sign(randnum) * (abs(randnum) + abs.min.hyper.params))
        } else {
          return(x)
        }
      }
      )
      )
    } else {
      par <- sigma.n.temp
    }
    names(par) <- c("sigma.n", names(hyper.params.init))

    # We're going to pick some crazy hyperparameters because we're randomly generating
    # them, and some of them are going to break the kernels (so far mainly by causing round-off
    # errors between the kernel and sigma_n). To avoid this, we're going to convert errors into
    # warnings, pretend the bad parameters never happened, and carry on with our day.
    tryCatch({
      log.likelihood.temp <- -get.marginal.likelihood.optimx(par=par, gp=gp, prior=prior)

      par.mat[i, ] <- par
      log.likelihood.vec[i] <- log.likelihood.temp

      if (log.likelihood.temp > best.log.likelihood | is.null(best.log.likelihood)) {
        best.log.likelihood <- log.likelihood.temp
        best.par <- par
      }
      successful_init_params <- successful_init_params + 1
    },
    error=function(e) {
      warning(paste("Bad random hyperparams encountered. Error:", e))
    }
    )
  }

  if (successful_init_params == 0 & random.init.gridsize > 0) {
    stop("No initial parameters successfully trained a GP. Check warnings for suppressed error messages")
  }

  if (verbose) print("Best pars found:")
  if (verbose) print(best.par)
  if (verbose) print(paste("Log likelihood:", best.log.likelihood))

  if (resample.top.inits) {
    if (is.null(resample.from)) {
      resample.from <- ceiling(random.init.gridsize * 0.1)
    }
    if (is.null(num.resamples)) {
      num.resamples <- random.init.gridsize
    }
    if (verbose) print(paste("Resampling based on top", resample.from, "candidate starting hyperparameters."))
    top.n.pars <- par.mat[order(log.likelihood.vec, decreasing=T)[1:resample.from], , drop=FALSE]

    if (verbose) print("Summary of top n hyperparam starting values:")
    if (verbose) print(summary(top.n.pars))

    resampled.par.mat <- matrix(0, nrow=num.resamples, ncol=length(hyper.params.init) + 1)
    resampled.log.likelihood.vec <- rep(-Inf, num.resamples)

    for (col in 1:ncol(top.n.pars)) {
      # Find a suitable bandwidth and then use this to sample from the
      # distribution of the top n for this par.
      if (sd(top.n.pars[, col]) != 0) {
        tryCatch({
          bw <- bw.SJ(top.n.pars[, col])
        },
        error=function(e) {
          warning(e)
          bw <- bw.nrd0(top.n.pars[, col])
        })
      } else {
        bw <- 10^-8
      }
      resampled.par.mat[, col] <- rnorm(num.resamples,
                                        sample(top.n.pars[, col],
                                               size = num.resamples,
                                               replace = TRUE),
                                        bw)
    }

    if (verbose) print(paste("Taking", num.resamples, "resamples of top n starting values."))
    for (i in 1:num.resamples) {
      par <- resampled.par.mat[i, ]
      names(par) <- c("sigma.n", names(hyper.params.init))
      # We're going to pick some crazy hyperparameters because we're randomly generating
      # them, and some of them are going to break the kernels (so far mainly by causing round-off
      # errors between the kernel and sigma_n). To avoid this, we're going to convert errors into
      # warnings, pretend the bad parameters never happened, and carry on with our day.
      tryCatch({
        log.likelihood.temp <- -get.marginal.likelihood.optimx(par=par, gp=gp, prior=prior)

        resampled.log.likelihood.vec[i] <- log.likelihood.temp

        if (log.likelihood.temp > best.log.likelihood | is.null(best.log.likelihood)) {
          best.log.likelihood <- log.likelihood.temp
          best.par <- par
        }
      },
      error=function(e) {
        warning(paste("Bad random hyperparams encountered. Error:", e))
      }
      )
    }
    if (verbose) print("Best pars found:")
    if (verbose) print(best.par)
    if (verbose) print(paste("Log likelihood:", best.log.likelihood))
    par.mat <- rbind(par.mat, resampled.par.mat)
    log.likelihood.vec <- c(log.likelihood.vec, resampled.log.likelihood.vec)
  }
  lower <- -Inf
  if (!is.null(abs.min.sigma.n) | abs.min.sigma.n != 0) {
    lower <- rep(-Inf, length(best.par))
    lower[1] <- abs.min.sigma.n
  }
  optimx.obj <- optimize.gp.params(gp,
                                   best.par,
                                   optimx.method,
                                   max.iterations,
                                   lower,
                                   optimx.starttests,
                                   optimx.trace,
                                   prior=prior)

  if (optimx.obj["convcode"] > 0) {
    warning("Did not converge!")
    best.log.likelihood <- -Inf
  } else {
    opt.pars <- as.numeric(optimx.obj)[1:(length(hyper.params.init) + 1)]
    names(opt.pars) <- c("sigma.n", names(hyper.params.init))
    best.log.likelihood <- -get.marginal.likelihood.optimx(par=opt.pars, gp=gp, prior=prior)
  }

  tested.par.indices <- c(1)
  candidate.par.index <- 1
  par.order <- order(log.likelihood.vec, decreasing=T)

  for (i in seq_len(additional.optimx.runs)) {
    # Take the next set of pars which is sufficiently different from the others tried
    par.diff <- 0
    while (candidate.par.index < nrow(par.mat) & par.diff < additional.par.perc.diff) {
      candidate.par.index <- candidate.par.index + 1
      new.par <- par.mat[par.order[candidate.par.index], ]
      par.diff <- min(apply(par.mat[tested.par.indices, , drop=FALSE],
                            1,
                            function(row) sqrt(sum((new.par - row)^2)) / sqrt(sum((row)^2))
      ))
    }

    if (candidate.par.index >= nrow(par.mat) | log.likelihood.vec[candidate.par.index] == -Inf) {
      # We've not got a sufficiently different set of pars
      break
    }

    tested.par.indices <- c(tested.par.indices, candidate.par.index)

    new.optimx.obj <- optimize.gp.params(gp,
                                         new.par,
                                         optimx.method,
                                         max.iterations,
                                         lower,
                                         optimx.starttests,
                                         optimx.trace,
                                         prior=prior)

    if (new.optimx.obj["convcode"] > 0) {
      warning("Did not converge!")
    } else {
      trained.pars <- as.numeric(new.optimx.obj)[1:(length(hyper.params.init) + 1)]
      names(trained.pars) <- c("sigma.n", names(hyper.params.init))
      new.log.likelihood <- -get.marginal.likelihood.optimx(par=trained.pars, gp=gp, prior=prior)
      if (new.log.likelihood > best.log.likelihood) {
        best.log.likelihood <- new.log.likelihood
        opt.pars <- trained.pars
        optimx.obj <- new.optimx.obj
      }
    }

  }
  if (best.log.likelihood != -Inf) {
    gp$optimized.hyperparams <- opt.pars[-1]
    gp$optimized.sigma.n <- opt.pars[1]
    if (verbose) print(paste("Final.pars after optimisation:"))
    if (verbose) print(opt.pars)
    if (verbose) print(paste("Log likelihood for best starting par optimisation:", best.log.likelihood))
  }

  return(list(gp=gp, optimx.obj=optimx.obj))
}


#' Create Predictions Using a Gaussian Process
#'
#' Returns the posterior mean and variance of a set of data points under a given Gaussian process
#'
#' @param gp.obj A trained gaussianProcess object
#' @param data Data to predict
#'
#' @return a list containing named elements:
#' \itemize{
#'   \item \code{mean} - the predicted mean value for each data point in \code{data}.
#'   \item \code{var} - the variance about the predicted mean for each data point in \code{data}.
#' }
#' @export
#'
#' @importFrom cacheMan cached_call
#'
#' @examples
#' x <- rnorm(50)
#' y <- sin(1/(x^2 + 0.15))
#' mt <- create.model.tree.builtin()
#' mt <- insert.kernel.instance(mt, 1, "SE", NULL, hyper.params=c(l=1))
#' gp <- create.gaussian.process(x, y, mt)
#' gp$fit.hyperparams(NA)
#'
#' x1 <- rnorm(50)
#' y1.predicted <- predict(gp, x1)$mean
#'
predict.GaussianProcess <- function(gp.obj, data) {
  if (!"optimized.hyperparams" %in% names(gp.obj)) {
    stop("No optimized hyperparameters found. This GP has not yet been trained.")
  }

  if (!is.matrix(data)) {
    data <- matrix(data, nrow=length(data))
  }

  num.data.points <- nrow(data)
  kernel.func.list <- list()
  K.star.inst.list <- list()

  K.star <- get_kstar_matrix(kernel=gp.obj$kernel,
                             data.to.predict=data,
                             training.data=gp.obj$training.points,
                             hyper.params=gp.obj$optimized.hyperparams)

  alpha <- cached_call(get.alpha,
                       kernel=gp.obj$kernel,
                       training.points=gp.obj$training.points,
                       training.point.values=gp.obj$training.point.values,
                       sigma.n=gp.obj$optimized.sigma.n,
                       hyper.params=gp.obj$optimized.hyperparams,
                       cache=gp.obj$cache)

  pred.out <- list()
  pred.out[["mean"]] <- K.star %*% alpha

  pred.out[["var"]] <- matrix(NA, nrow=num.data.points, ncol=1)

  cov.mat.chol <- cached_call(get.cov.mat.chol,
                              kernel=gp.obj$kernel,
                              x=gp.obj$training.points,
                              sigma.n=gp.obj$optimized.sigma.n,
                              hyper.params=gp.obj$optimized.hyperparams,
                              cache=gp.obj$cache)

  cov.mat.L.inv <- cached_call(solve,
                               t(cov.mat.chol),
                               cache=gp.obj$cache)

  for (i in seq(num.data.points)) {
    v <- cov.mat.L.inv %*% t(K.star[i, , drop=F])
    pred.out$var[i, 1] <- (get_covariance_matrix(kernel=gp.obj$kernel,
                                                 x=data[i, , drop=FALSE],
                                                 sigma.n=gp.obj$optimized.sigma.n,
                                                 hyper.params=gp.obj$optimized.hyperparams)
                           - t(v) %*% v)
  }

  return(pred.out)
}


#' Plot a Gaussian Process
#'
#' Plots the posterior mean and first three standard deviations of a one-dimensional Gaussian process.
#'
#' @param gp.obj A gaussianProcess object
#' @param xlim The limits of the x-axis in the plot. Defaults to the limits of the training data plus one-quarter of the range at either end.
#' @param num.points The number of data points to calculate for the plot
#' @param col.index if the predictor data is multi-dimensional, specify which dimension to plot. Other dimensions will be set to their mean.
#' @param ... Additional parameters to pass to \code{plot}
#'
#' @return NULL
#' @export
plot.GaussianProcess <- function(gp.obj, xlim=NULL, num.points=1000, col.index=1, ...) {
  if (is.null(dim(gp.obj$training.points))) {
    x.data <- gp.obj$training.points
  } else {
    x.data <- gp.obj$training.points[, col.index]
  }

  if (is.null(xlim)) {
    x.min <- min(x.data)
    x.max <- max(x.data)

    x.range <- x.max - x.min

    x.min <- x.min - x.range / 4
    x.max <- x.max + x.range / 4
  } else {
    x.min = xlim[1]
    x.max = xlim[2]
  }

  x.values <- seq(from=x.min, to=x.max, length.out=num.points)
  if (is.null(dim(gp.obj$training.points))) {
    preds <- predict(gp.obj, x.values)
  } else {
    input.data <- matrix(colMeans(gp.obj$training.points),
                         nrow=length(x.values),
                         ncol=ncol(gp.obj$training.points),
                         byrow=TRUE)
    input.data[, col.index] <- x.values
    preds <- predict(gp.obj, input.data)
  }

  lower.bounds <- preds$mean - 3*sqrt(preds$var)
  upper.bounds <- preds$mean + 3*sqrt(preds$var)

  lower.2sd <- preds$mean - 2*sqrt(preds$var)
  upper.2sd <- preds$mean + 2*sqrt(preds$var)

  lower.1sd <- preds$mean - 1*sqrt(preds$var)
  upper.1sd <- preds$mean + 1*sqrt(preds$var)


  lower.bound <- min(lower.bounds, gp.obj$training.point.values)
  upper.bound <- max(upper.bounds, gp.obj$training.point.values)

  if (is.na(lower.bound) | is.na(upper.bound)) {
    warning("Lower or Upper bound is NA")
  } else {
    plot(x.values, preds$mean, type="n", ylim=c(lower.bound, upper.bound), ...)


    polygon(c(rev(x.values), x.values), c(rev(lower.bounds), upper.bounds), col = 'grey80', border = NA)
    polygon(c(rev(x.values), x.values), c(rev(lower.2sd), upper.2sd), col = 'grey70', border = NA)
    polygon(c(rev(x.values), x.values), c(rev(lower.1sd), upper.1sd), col = 'grey60', border = NA)
    lines(x.values, preds$mean)

    points(x.data, gp.obj$training.point.values)
  }

}


#' Sample a function from a Gaussian Process Prior
#'
#' @param x Points to sample at
#' @param kernel Kernel of the Gaussian Process
#' @param sigma.n Standard deviation of the noise
#' @param hyper.params Hyperparameters of the kernel
#' @param num.functions The number of different functions to generate
#'
#' @return A numeric vector of values at the sampled points
#' @export
#'
#' @importFrom mnormt rmnorm
sample.functions.from.kernel <- function(x, kernel, sigma.n, hyper.params, num.functions=1) {
  cov.mat <- get_covariance_matrix(kernel, x, sigma.n, hyper.params)
  y <- rmnorm(n=num.functions, mean=rep(0, nrow(cov.mat)), cov.mat)
  return(y)
}