R/bic.R

Defines functions model.search posterior.laplace.approximation bayesian.information.criterion.EDoF.eigen bayesian.information.criterion.EDoF bayesian.information.criterion

Documented in bayesian.information.criterion bayesian.information.criterion.EDoF bayesian.information.criterion.EDoF.eigen model.search posterior.laplace.approximation

#' Calculate BIC of a GP
#'
#' @param gp.obj A trained gaussianProcess object
#'
#' @return The BIC value for the input GP object.
#' @export
#' @seealso \code{\link{model.search}} and \code{\link{gaussianProcess}}
#'
#' @importFrom cacheMan cached_call
#'
#' @examples
#' x <- rnorm(50)
#' y <- sin(1/(x^2 + 0.15))
#' mt <- create.model.tree.builtin()
#' mt <- insert.kernel.instance(mt, 1, "SE", NULL, hyper.params=c(l=1))
#' gp <- create.gaussian.process(x, y, mt)
#' gp$fit.hyperparams(NA)
bayesian.information.criterion <- function(gp.obj, ...) {
  # Must be a trained GP
  log.L <- get.marginal.likelihood(gp=gp.obj,
                                   sigma.n=gp.obj$optimized.sigma.n,
                                   hyper.params=gp.obj$optimized.hyperparams)
  k <- length(gp.obj$optimized.hyperparams) + 1
  n <- nrow(gp.obj$training.points)
  return(-2 * log.L + k * log(n))
}

#' Calculate BIC of a GP (Effective DoF)
#'
#' Calculate the BIC using the effective degrees of freedom of the kernel,
#' rather than the number of free parameters
#'
#' @param gp.obj A trained gaussianProcess object
#'
#' @return The BIC value for the input GP object.
#' @export
#' @seealso \code{\link{model.search}} and \code{\link{gaussianProcess}}
#'
#' @importFrom cacheMan cached_call
#'
#' @examples
#' x <- rnorm(50)
#' y <- sin(1/(x^2 + 0.15))
#' mt <- create.model.tree.builtin()
#' mt <- insert.kernel.instance(mt, 1, "SE", NULL, hyper.params=c(l=1))
#' gp <- create.gaussian.process(x, y, mt)
#' gp$fit.hyperparams(NA)
bayesian.information.criterion.EDoF <- function(gp.obj, ...) {
  # Must be a trained GP
  log.L <- get.marginal.likelihood(gp=gp.obj,
                                   sigma.n=gp.obj$optimized.sigma.n,
                                   hyper.params=gp.obj$optimized.hyperparams)


  cov.mat.chol <- cached_call(get.cov.mat.chol,
                              kernel=gp.obj$kernel,
                              x=gp.obj$training.points,
                              sigma.n=gp.obj$optimized.sigma.n,
                              hyper.params=gp.obj$optimized.hyperparams,
                              cache=gp.obj$cache)

  cov.mat.inv <- cached_call(chol2inv,
                             cov.mat.chol,
                             cache=gp.obj$cache)

  cov.mat.noisefree <- cached_call(get_covariance_matrix,
                                   kernel=gp.obj$kernel,
                                   x=gp.obj$training.points,
                                   sigma.n=0,
                                   hyper.params=gp.obj$optimized.hyperparams,
                                   cache=gp.obj$cache)


  # Trace of matrix product is sum of pointwise product
  k <- sum(cov.mat.inv * cov.mat.noisefree)
  n <- nrow(gp.obj$training.points)
  return(-2 * log.L + k * log(n))
}


#' Calculate BIC of a GP (Effective DoF, eigenvalue decomposition)
#'
#' Calculate the BIC using the effective degrees of freedom of the kernel,
#' rather than the number of free parameters
#'
#' @param gp.obj A trained gaussianProcess object
#'
#' @return The BIC value for the input GP object.
#' @export
#' @seealso \code{\link{model.search}} and \code{\link{gaussianProcess}}
#'
#' @importFrom cacheMan cached_call
#'
#' @examples
#' x <- rnorm(50)
#' y <- sin(1/(x^2 + 0.15))
#' mt <- create.model.tree.builtin()
#' mt <- insert.kernel.instance(mt, 1, "SE", NULL, hyper.params=c(l=1))
#' gp <- create.gaussian.process(x, y, mt)
#' gp$fit.hyperparams(NA)
bayesian.information.criterion.EDoF.eigen <- function(gp.obj, ...) {
  # Must be a trained GP
  log.L <- get.marginal.likelihood(gp=gp.obj,
                                   sigma.n=gp.obj$optimized.sigma.n,
                                   hyper.params=gp.obj$optimized.hyperparams)

  cov.mat.noisefree <- cached_call(get_covariance_matrix,
                                   kernel=gp.obj$kernel,
                                   x=gp.obj$training.points,
                                   sigma.n=0,
                                   hyper.params=gp.obj$optimized.hyperparams,
                                   cache=gp.obj$cache)


  # Trace of matrix product is sum of pointwise product
  eigenvals <- eigen(cov.mat.noisefree, symmetric=TRUE, only.values=TRUE)$values
  k <- sum(eigenvals / (eigenvals + gp.obj$optimized.sigma.n^2))
  n <- nrow(gp.obj$training.points)
  return(-2 * log.L + k * log(n))
}

#' Calculate Laplace Approximation of a GP
#'
#' Calculate the Laplace Approximation of the posterior log likelihood of a gp
#'
#' @param gp.obj A trained gaussianProcess object
#' @param prior The prior over the hyperparameters
#'
#' @return The approximate posterior log likelihood for the input GP object.
#' @export
#' @seealso \code{\link{model.search}} and \code{\link{gaussianProcess}}
posterior.laplace.approximation <- function(gp.obj, prior, ...) {
  # Must be a trained GP
  log.L <- get.marginal.likelihood(gp=gp.obj,
                                   sigma.n=gp.obj$optimized.sigma.n,
                                   hyper.params=gp.obj$optimized.hyperparams)

  H <- -get.marginal.likelihood.hessian(gp=gp.obj,
                                        sigma.n=gp.obj$optimized.sigma.n,
                                        hyper.params=gp.obj$optimized.hyperparams)

  log.prior.H <- -prior$log.prior.hessian(c(gp.obj$optimized.sigma.n, gp.obj$optimized.hyperparams))

  log.det.H <- as.numeric(determinant(H + log.prior.H, logarithm=TRUE)$modulus)

  return(-2 * log.L + log.det.H)
}


#' Perform Model Selection Using BIC
#'
#' @param x A matrix or data frame of predictors
#' @param y A numeric vector of responses
#' @param base.model.tree The model tree at which the search starts
#' @param plot.gp Whether to plot the gaussian processes encountered during the search. If ncol(x) > 1 this parameter is ignored and no plots are created.
#' @param reset.params By default the search starts with the optimum hyperparameter values found in the previous step in the search (with new hyperparameters fitted from random start points). Setting this to TRUE causes all hyperparameters to be randomly set at each step.
#' @param max.new.nodes The number of kernel instances to add to the base model.
#' @param revisit.kernels Whether to revisit previously-assessed kernels when deleting nodes from the current best candidate.
#' @param scoring.function The scoring function for the GPs in the search. Must take a trained gaussianProcess object as its only parameter, and return a scalar score. If \code{scoring.function} returns a numeric vector only the first entry will be used.
#' @param ... Additional parameters to be passed to gp.obj$fit.hyperparameters (see \code{\link{gaussianProcess}})
#'
#' @return The trained gaussianProcess object with the best BIC.
#' @export
#'
#' @importFrom cacheMan create_cache
#'
#' @examples
#' x <- rnorm(50)
#' y <- sin(1/(x^2 + 0.15))
#' mt <- create.model.tree.builtin()
#' gp <- model.search(x, y, mt)
model.search <- function(x,
                         y,
                         base.model.tree,
                         plot.gp=FALSE,
                         reset.params=FALSE,
                         max.new.nodes=3,
                         revisit.kernels=TRUE,
                         scoring.function=bayesian.information.criterion,
                         return.all.models=FALSE,
                         alternate.scoring.functions=list(),
                         ...) {
  model.score.vec <- numeric()

  evaluated.models <- character(0)

  cache <- create_cache()

  output <- list()

  if (return.all.models) {
    output$model.list <- list()
  }

  scoring.function.name <- as.character(substitute(scoring.function))

  scores <- data.frame(matrix(nrow=0, ncol=1 + length(alternate.scoring.functions)))
  colnames(scores) <- c(scoring.function.name, names(alternate.scoring.functions))

  if (nrow(base.model.tree$tree) != 0) {
    # There is a base model defined, so first evaluate that.
    base.model.tree <- reduce.to.canonical.tree(base.model.tree)
    k <- create.kernel.object.from.model.tree(base.model.tree)
    gp.obj <- create.gaussian.process(x, y, k, cache)

    result <- fit.hyperparams(gp.obj, ...)
    gp.obj <- result$gp
    model.name <- as.character(gp.obj$kernel$kernel)

    model.score.vec[model.name] <- scoring.function(gp.obj, ...)

    scores[model.name, scoring.function.name] <- model.score.vec[model.name]

    for (i in names(alternate.scoring.functions)) {
      scores[model.name, i] <- alternate.scoring.functions[[i]](gp.obj, ...)
    }

    if (return.all.models) {
      output$model.list[[model.name]] <- gp.obj
    }

    evaluated.models <- c(evaluated.models, as.character(gp.obj$kernel$kernel))
    print(paste("Base model:", as.character(gp.obj$kernel$kernel)))
    print(paste("Base model Score:", model.score.vec[as.character(gp.obj$kernel$kernel)]))

    best.score <- model.score.vec[as.character(gp.obj$kernel$kernel)]
    best.gp <- gp.obj
    best.model <- gp.obj$kernel$kernel
    if (plot.gp) plot(gp.obj)

  } else {
    best.score <- Inf
    best.gp <- NULL
    best.model <- base.model.tree
  }

  score.improved <- TRUE

  new.nodes <- 0

  while (score.improved & new.nodes < max.new.nodes) {
    new.nodes <- new.nodes + 1

    if (reset.params & best.score != Inf) {
      reset.vec <- c(FALSE, TRUE)
    } else {
      reset.vec <- FALSE
    }

    score.improved <- FALSE
    next.models <- generate.next.models(best.model)

    if (!revisit.kernels) {
      duplicate.models <- numeric(0)

      for (i in seq_along(next.models)) {
        if (as.character(next.models[[i]]) %in% evaluated.models) {
          duplicate.models <- c(duplicate.models, i)
        }
      }
      if (length(duplicate.models) > 0) {
        next.models <- next.models[-duplicate.models]
      }
    }

    prev.best.hyper.params <- best.gp$optimized.hyperparams
    prev.best.sigma.n <- best.gp$optimized.sigma.n

    for (i in 1:length(next.models)) {
      for (reset in reset.vec) {
        current.model <- next.models[[i]]
        current.k <- create.kernel.object.from.model.tree(current.model)
        current.hyper.params <- rep(NA, length(current.k$hyperparam_names))
        names(current.hyper.params) <- current.k$hyperparam_names
        if (!reset) {
          current.hyper.params[names(prev.best.hyper.params)] <- prev.best.hyper.params
        }
        convcode <- 1
        counter <- 0
        while (convcode > 0 & counter < 3) {
          # This didn't converge - try again a few times, then give up and ignore this model
          counter <- counter + 1
          tryCatch({
            gp.obj <- create.gaussian.process(x, y, current.k, cache)
            result <- fit.hyperparams(gp.obj, hyper.params.init=current.hyper.params, ...)
            gp.obj <- result$gp
            convcode <- result$optimx.obj["convcode"]
          }, error=function(e) {
            print(e)
            convcode <- 1
          })
        }

        if (convcode == 0) {
          model.score <- scoring.function(gp.obj, ...)

          model.name <- paste(as.character(gp.obj$kernel$kernel), as.character(reset), sep=".")

          model.score.vec[model.name] <- model.score

          scores[model.name, scoring.function.name] <- model.score.vec[model.name]

          for (score_name_temp in names(alternate.scoring.functions)) {
            scores[model.name, score_name_temp] <- alternate.scoring.functions[[score_name_temp]](gp.obj, ...)
          }

          if (return.all.models) {
            output$model.list[[model.name]] <- gp.obj
          }

          evaluated.models <- c(evaluated.models, as.character(gp.obj$kernel$kernel))
          print(paste("Model:", as.character(gp.obj$kernel$kernel)))
          print(paste("Model Score:", model.score))
          print("Model hyperparams:")
          print(gp.obj$optimized.hyperparams)
          print(paste("sigma_n:", gp.obj$optimized.sigma.n))
          print(as.matrix(model.score.vec[order(model.score.vec)]))
          print(as.matrix(scores[order(model.score.vec), ]))
          if (plot.gp) plot(gp.obj, main=as.character(gp.obj$kernel$kernel))
          if (model.score < best.score) {
            best.score <- model.score
            best.gp <- gp.obj
            best.model <- gp.obj$kernel$kernel
            score.improved <- TRUE
          }
        } else {
          warnmess <- paste("Model", as.character(current.model), "failed to converge")
          warning(warnmess)
          print(warnmess)
        }
      }
    }
  }
  output$best.gp <- best.gp
  output$scores <- scores
  return(output)
}
mattdneal/gaussianProcess documentation built on May 21, 2019, 12:58 p.m.