## GP VI functions required to implement the sparse variational GP approximation of
## Titsias (2009)
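## The ELBO implemented below (Gaussian likelihood) is the collapsed bound
##   log N(y | mu, Sigma12 %*% solve(Sigma22) %*% t(Sigma12) + tau^2 * I)
##     - (1 / (2 * tau^2)) * sum(diag(K_ff) - diag(Sigma12 %*% solve(Sigma22) %*% t(Sigma12))),
## where Sigma12 holds covariances between data and knot locations, Sigma22 is the
## covariance among knots, K_ff is the nugget-free covariance among data locations,
## and tau^2 is the noise variance (small jitter terms delta are added for stability).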
## compute the trace term (as it is called in Bauer et al. 2016)
#' Calculate the trace term in the ELBO.
#' @description Calculate the trace term in the ELBO for use in the variational approximation.
#'
#' @param cov_par A named list of covariance parameters.
#' @param Sigma12 Covariance matrix between observed data locations (rows) and knots (columns).
#' @param Sigma22 Variance covariance matrix at the knots.
#' @param delta A small diagonal perturbation to Sigma22 for numerical stability.
#' @return The value of the trace term in the ELBO.
#' @export
trace_term_fun <- function(cov_par, Sigma12, Sigma22, delta)
{
tau <- cov_par$tau
sigma <- cov_par$sigma
Z2 <- solve(a = Sigma22, b = t(Sigma12))
Z3 <- Sigma12 * t(Z2)
Z4 <- apply(X = Z3, MARGIN = 1, FUN = sum)
## Note that Lambda here is the conditional variance matrix of the latent
## function, so tau does not appear
Lambda <- sigma^2 + delta - Z4
return(-(1/(2 * tau^2)) * sum(Lambda))
}
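## A minimal numerical sketch of what trace_term_fun computes: the term equals
## -1/(2 * tau^2) * sum(diag(K_ff) + delta - diag(Sigma12 %*% solve(Sigma22) %*% t(Sigma12))).
## The exponential kernel and synthetic locations below are assumptions used only so the
## example is self-contained; the block is wrapped in if (FALSE) so it does not run when
## this file is sourced.
if (FALSE)
{
  xy <- matrix(seq(0, 1, length.out = 20), ncol = 1)   ## data locations
  xu <- matrix(c(0.2, 0.5, 0.8), ncol = 1)             ## knot locations
  cov_par <- list("sigma" = 1.5, "l" = 0.3, "tau" = 0.5)
  delta <- 1e-6
  kern <- function(x1, x2) cov_par$sigma^2 * exp(-abs(x1 - x2) / cov_par$l)
  Sigma12 <- outer(as.numeric(xy), as.numeric(xu), FUN = kern)
  Sigma22 <- outer(as.numeric(xu), as.numeric(xu), FUN = kern) + delta * diag(nrow(xu))
  ## direct computation of the trace term for comparison
  Lambda_direct <- cov_par$sigma^2 + delta - diag(Sigma12 %*% solve(Sigma22, t(Sigma12)))
  direct <- -(1 / (2 * cov_par$tau^2)) * sum(Lambda_direct)
  all.equal(trace_term_fun(cov_par = cov_par, Sigma12 = Sigma12,
                           Sigma22 = Sigma22, delta = delta),
            direct)
}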
## compute derivative of trace term
#' Derivative of the trace term with respect to the nugget.
#' @description Calculate the derivative of the trace term with respect to the log of the
#'   nugget/noise standard deviation tau.
#'
#' @param cov_par A named list of covariance parameters.
#' @param trace_term Scalar value of the current trace term in the ELBO.
#' @return The derivative of the trace term with respect to log(tau).
#' @export
dtrace_term_dtau <- function(cov_par, trace_term)
{
## note that this is the derivative wrt log tau
tau <- cov_par$tau
sigma <- cov_par$sigma
return(-2 * trace_term)
}
#' Derivative of the trace term with respect to a covariance parameter.
#' @description Calculate the derivative of the trace term with respect to any covariance
#' parameter other than the nugget.
#'
#' @param cov_par A named list of covariance parameters.
#' @param A_trace A vector calculated internally in the ELBO gradient function (delbo_dcov_par).
#' @return The derivative of the trace term with respect to the given covariance parameter.
#' @export
dtrace_term_dcov_par <- function(cov_par, A_trace)
{
tau <- cov_par$tau
sigma <- cov_par$sigma
return(-(1/(2 * tau^2)) * sum(A_trace))
}
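## A small sketch checking dtrace_term_dtau against a central finite difference in
## log(tau): the trace term is proportional to 1/tau^2, so its derivative with respect
## to log(tau) is -2 times its value. dtrace_term_dcov_par is exercised through
## delbo_dcov_par below, which supplies the A_trace vector. The synthetic matrices are
## assumptions for illustration only; the block is wrapped in if (FALSE) so it does not
## run when this file is sourced.
if (FALSE)
{
  xy <- matrix(seq(0, 1, length.out = 15), ncol = 1)
  xu <- matrix(c(0.25, 0.75), ncol = 1)
  delta <- 1e-6
  cov_par <- list("sigma" = 1.2, "l" = 0.3, "tau" = 0.4)
  kern <- function(x1, x2) cov_par$sigma^2 * exp(-abs(x1 - x2) / cov_par$l)
  Sigma12 <- outer(as.numeric(xy), as.numeric(xu), FUN = kern)
  Sigma22 <- outer(as.numeric(xu), as.numeric(xu), FUN = kern) + delta * diag(nrow(xu))
  tt <- trace_term_fun(cov_par = cov_par, Sigma12 = Sigma12, Sigma22 = Sigma22, delta = delta)
  h <- 1e-6
  cp_up <- cov_par; cp_up$tau <- exp(log(cov_par$tau) + h)
  cp_dn <- cov_par; cp_dn$tau <- exp(log(cov_par$tau) - h)
  fd_tau <- (trace_term_fun(cov_par = cp_up, Sigma12 = Sigma12, Sigma22 = Sigma22, delta = delta) -
               trace_term_fun(cov_par = cp_dn, Sigma12 = Sigma12, Sigma22 = Sigma22, delta = delta)) / (2 * h)
  c("analytic" = dtrace_term_dtau(cov_par = cov_par, trace_term = tt), "finite_diff" = fd_tau)
}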
## compute the elbo
#' @export
elbo_fun <- function(ff = NA, mu, Z, Sigma12,
Sigma22, y,
trace_term_fun = trace_term_fun,
cov_par, ...)
{
## ff are the (maximized) GP values at the observed data locations (unused here; kept for interface compatibility)
## y is the vector of observed data
## mu is a vector of GP means of the same length as y
## Sigma12 is the matrix of covariances between the GP at the observed and knot locations
## Sigma22 is the variance covariance matrix at the knot locations
## Z is the vector of marginal noise variances (tau^2 + delta) forming the diagonal of the noise matrix
## trace_term_fun is a function to compute the trace term
## make sure ff is numeric
# ff <- as.numeric(ff)
y <- as.numeric(y)
## extract optional arguments passed through ...
args <- list(...)
delta <- args$delta
## compute the quadratic form of the GP contribution to the objective function
ZSig12 <- (1/Z) * Sigma12
## compute cholesky decomposition of Sigma22 + Sigma21 %*% Z_inv Sigma12 and the log of the det
# print(Sigma22)
# print(t(Sigma12) %*% ZSig12)
# print(Sigma22 + t(Sigma12) %*% ZSig12)
# print(head(diag(Sigma22)))
# print(head(diag(Sigma22 + t(Sigma12) %*% ZSig12)))
R <- chol(x = Sigma22 + t(Sigma12) %*% ZSig12)
logdetR <- 2 * sum(log(diag(R)))
# quad_form_part <- -(1/2) * (t(y - mu) %*% ( (1/Z) * (y - mu))) +
# (1/2) * (t(y - mu) %*% ZSig12) %*% solve(a = Sigma22 + t(Sigma12) %*% ZSig12 , b = t((t(y - mu) %*% ZSig12)))
quad_form_part <- -(1/2) * (t(y - mu) %*% ( (1/Z) * (y - mu))) +
(1/2) * (t(y - mu) %*% ZSig12) %*% solve(a = R, b = solve(a = t(R), b = t((t(y - mu) %*% ZSig12))))
## compute the contribution from the determinants
det_part <- -(1/2) * ( sum(log(Z)) - log(det(Sigma22)) +
# log(det( solve(Sigma22) + t(Sigma12) %*% ZSig12 ))
logdetR
)
## compute the trace term
trace_term <- trace_term_fun(cov_par = cov_par,
Sigma12 = Sigma12,
Sigma22 = Sigma22,
delta = delta)
return(
quad_form_part + det_part - (length(y) / 2) * log(2*pi) + trace_term
)
}
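## A sketch checking elbo_fun against a dense evaluation of the collapsed bound
## log N(y | mu, Q_nn + tau^2 I) - (1 / (2 tau^2)) tr(K_ff - Q_nn), with
## Q_nn = Sigma12 %*% solve(Sigma22) %*% t(Sigma12). The exponential kernel and synthetic
## data are assumptions so the example does not rely on the package's covariance
## constructors; wrapped in if (FALSE) so it does not run when this file is sourced.
if (FALSE)
{
  set.seed(2)
  n <- 25; m <- 4
  xy <- matrix(sort(runif(n)), ncol = 1)
  xu <- matrix(seq(0.1, 0.9, length.out = m), ncol = 1)
  cov_par <- list("sigma" = 1, "l" = 0.2, "tau" = 0.3)
  delta <- 1e-6
  kern <- function(x1, x2) cov_par$sigma^2 * exp(-abs(x1 - x2) / cov_par$l)
  Sigma12 <- outer(as.numeric(xy), as.numeric(xu), FUN = kern)
  Sigma22 <- outer(as.numeric(xu), as.numeric(xu), FUN = kern) + delta * diag(m)
  mu <- rep(0, n)
  y <- rnorm(n)
  Z <- rep(cov_par$tau^2 + delta, n)
  ## dense evaluation of the same bound
  Qnn <- Sigma12 %*% solve(Sigma22, t(Sigma12))
  S <- Qnn + diag(Z)
  dense <- -0.5 * (as.numeric(determinant(S, logarithm = TRUE)$modulus) +
                     as.numeric(t(y - mu) %*% solve(S, y - mu)) + n * log(2 * pi)) -
    (1 / (2 * cov_par$tau^2)) * sum(cov_par$sigma^2 + delta - diag(Qnn))
  all.equal(as.numeric(elbo_fun(mu = mu, Z = Z, Sigma12 = Sigma12, Sigma22 = Sigma22,
                                y = y, trace_term_fun = trace_term_fun,
                                cov_par = cov_par, delta = delta)),
            dense)
}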
## gradient of the ELBO wrt covariance parameters
#' @export
delbo_dcov_par <- function(cov_par,
cov_fun,
dcov_fun_dtheta,
dcov_fun_dknot = NA,
knot_opt,
xu, xy, y,
ff = NA,
mu, transform = TRUE,
delta = 1e-6, ...)
{
## cov_par is a list of the covariance function parameters
## NAMES AND ORDERING MUST MATCH dcov_fun_dtheta
## cov_fun is a string denoting the covariance function: "sqexp", "exp", or "ard" (squared exponential, exponential, or ARD)
## xy are the observed data locations (matrix where rows are observations)
## xu are the unobserved knot locations (matrix where rows are knot locations)
## y is the vector of the observed data values
## mu is the mean of the GP at each of the observed data locations
## ff is the current value for the ff vector that maximizes log(p(y|lambda)) + log(p(ff|theta, xu))
## dcov_fun_dtheta is a list of functions which corresponds (in order) to the covariance parameters in
## cov_par. These functions compute the derivatives of the covariance function wrt the particular covariance parameter.
## NAMES AND ORDERING MUST MATCH cov_par
## dcov_fun_dknot is a single function that gives the derivative of the covariance function
## wrt the second input location x2, which I will always use for the knot location.
## knot_opt is a vector giving the knots over which to optimize, other knots are held constant
## transform is a logical argument that says whether to optimize on scales such that parameters are unconstrained
## ... are arguments to be passed to the functions taking derivatives of log(p(y|lambda)) wrt ff
## store transformed parameter values if transform == TRUE
if(transform == TRUE)
{
trans_par <- cov_par
}
## initialize gradient
if(is.list(dcov_fun_dtheta))
{
grad <- numeric(length = length(cov_par))
names(grad) <- names(cov_par)
}
if(!is.list(dcov_fun_dtheta))
{
grad <- 0
}
if(is.function(dcov_fun_dknot))
{
grad_knot <- numeric(length = nrow(xu) * ncol(xu))
## get bounds for knots
knot_bounds_l <- apply(X = xy, MARGIN = 2, FUN = min)
knot_bounds_u <- apply(X = xy, MARGIN = 2, FUN = max)
diffs <- knot_bounds_u - knot_bounds_l
knot_bounds <- cbind(knot_bounds_l - diffs/10, knot_bounds_u + diffs/10)
}
trans_knot <- xu
## make sure y is a vector
y <- as.numeric(y)
## create matrices that we will need for every derivative calculation
if(cov_fun == "ard")
{
lnames <- paste("l", 1:ncol(xy), sep = "")
## create Sigma12
Sigma12 <- make_cov_mat_ardC(x = xy, x_pred = xu,
cov_par = cov_par,
cov_fun = cov_fun,
delta = delta,
lnames = lnames)
## create Sigma22
Sigma22 <- make_cov_mat_ardC(x = xu,
x_pred = matrix(),
cov_fun = cov_fun,
cov_par = cov_par,
delta = delta,
lnames = lnames) - as.list(cov_par)$tau^2 * diag(nrow(xu))
}
else{
lnames <- character()
## create Sigma12
Sigma12 <- make_cov_matC(x = xy, x_pred = xu,
cov_par = cov_par,
cov_fun = cov_fun,
delta = delta)
## create Sigma22
Sigma22 <- make_cov_matC(x = xu,
x_pred = matrix(),
cov_fun = cov_fun,
cov_par = cov_par,
delta = delta) - as.list(cov_par)$tau^2 * diag(nrow(xu))
}
## create the Z vector and related quantities
## create Z
# Z2 <- solve(a = Sigma22, b = t(Sigma12))
FF <- solve(a = Sigma22, b = t(Sigma12))
Z <- numeric()
Z <- rep(cov_par$tau^2 + delta, times = nrow(Sigma12))
B <- 1/Z
R <- chol(Sigma22 + t(Sigma12) %*% ((1/Z) * Sigma12))
# FF <- solve(a = Sigma22, b = t(Sigma12))
## create the inverse of a commonly used m x m matrix (if m is the number of knot locations)
## maybe replace with cholesky decomposition eventually
# RC <- chol(Sigma22 + t(Sigma12) %*% (B * Sigma12))
C <- solve(a = Sigma22 + t(Sigma12) %*% (B * Sigma12))
## Perform other matrix computations shared by all derivatives
## compute component 2
## t(y - mu) %*% Sigma_inverse %*% dSigma/dtheta %*% Sigma_inverse %*% (y - mu)
comp2_1 <- (1/Z) * (y - mu) - t(solve(a = R, b = solve(a = t(R), b = t( (1/Z) * Sigma12 )))) %*%
(t((1/Z) * Sigma12) %*% (y - mu))
## compute current trace term
current_trace_term <- trace_term_fun(cov_par = cov_par,
Sigma12 = Sigma12,
Sigma22 = Sigma22,
delta = delta)
## skip derivatives of the covariance function wrt theta if dcov_fun_dtheta is not a list
if(is.list(dcov_fun_dtheta))
{
## loop through each covariance parameter
for(p in 1:length(cov_par))
{
par_name <- names(cov_par)[p]
## store transformed parameter values
if(par_name %in% lnames)
{
trans_par[[p]] <- pos_to_real(x = cov_par[[which(names(cov_par) == par_name)]])
}
else{
trans_par[[which(names(cov_par) == par_name)]] <-
dcov_fun_dtheta[[which(names(dcov_fun_dtheta) == par_name)]](x1 = 0,
x2 = 0,
cov_par = cov_par,
transform = transform)$trans_par
}
## first compute dSigma12/dtheta_j
if(cov_fun == "ard")
{
## first compute dSigma12/dtheta_j
dSigma12_dtheta <- dsig_dtheta_ardC(x = xy,
x_pred = xu,
cov_par = cov_par,
cov_fun = cov_fun,
par_name = par_name,
lnames = lnames)
## next compute dSigma22/dtheta_j
dSigma22_dtheta <- dsig_dtheta_ardC(x = xu,
x_pred = matrix(),
cov_par = cov_par,
cov_fun = cov_fun,
par_name = par_name,
lnames = lnames)
}
else{
## first compute dSigma12/dtheta_j
dSigma12_dtheta <- dsig_dthetaC(x = xy,
x_pred = xu,
cov_par = cov_par,
cov_fun = cov_fun,
par_name = par_name)
## next compute dSigma22/dtheta_j
dSigma22_dtheta <- dsig_dthetaC(x = xu,
x_pred = matrix(),
cov_par = cov_par,
cov_fun = cov_fun,
par_name = par_name)
}
## NOTE: for Gaussian data, tau is a special case where dSigma22_dtheta = 0
if(par_name == "tau")
{
dSigma22_dtheta <- matrix(0, nrow = nrow(xu), ncol = nrow(xu))
}
##########################################################################
## create components to compute the derivative of the trace term
## Create the vector A_trace = diag( dvar(f)/dtheta - (d/dtheta) Sigma12 %*% Sigma22_inverse %*% Sigma21)
## Note that for the noise variance/tau, A is zero... At least in the Gaussian data case.
if(par_name != "tau")
{
A1_trace <- numeric(length = nrow(xy))
if(cov_fun != "ard" | !(par_name %in% lnames))
{
for(i in 1:length(A1_trace))
{
temp <- eval(
substitute(expr = dcov_fun_dtheta$par(x1 = a, x2 = b, cov_par = c, transform = d),
env = list("a" = xy[i,], "b" = xy[i,], "c" = cov_par,
"par" = par_name, "d" = transform))
)
A1_trace[i] <- temp$derivative
}
}
#A1 <- rep(2 * cov_par$sigma * (ifelse(test = transform == TRUE, yes = cov_par$sigma, no = 1)), times = nrow(xy))
A2_trace <- numeric(length = nrow(xy))
# temp1 <- (2 * dSigma12_dtheta - Sigma12 %*% solve(a = Sigma22, b = dSigma22_dtheta))
temp1 <- (2 * dSigma12_dtheta - t(FF) %*% dSigma22_dtheta )
A2_trace <- apply(X = temp1 * t(FF), MARGIN = 1, FUN = sum)
# temp2 <- FF
# for(i in 1:length(A2))
# {
# A2[i] <- temp1[i,] %*% temp2[,i]
# }
A_trace <- A1_trace - A2_trace
##########################################################################
}
## Create the vector A = diag( dvar(y)/dtheta - (d/dtheta) Sigma12 %*% Sigma22_inverse %*% Sigma21)
## if we are dealing with transformed parameters make sure you have the derivative dsigma/dsigma_transformed!!
A1 <- numeric(length = nrow(xy))
if(par_name == "tau")
{
for(i in 1:length(A1))
{
temp <- eval(
substitute(expr = dcov_fun_dtheta$par(x1 = a, x2 = b, cov_par = c, transform = d),
env = list("a" = xy[i,], "b" = xy[i,], "c" = cov_par, "par" = par_name, "d" = transform))
)
A1[i] <- temp$derivative
}
}
A <- A1
## we compute the gradient by summing some components together
## compute component 1
## trace( (Sigma)_inverse dSigma/dtheta_j)
comp1_1 <- sum(A * B) - sum( diag( C %*% t(Sigma12) %*% (B * A * B * Sigma12)) )
# comp1_1 <- sum(A * B) - sum( diag( solve( a = RC, b = solve( t(RC) , b = t(Sigma12) ) ) %*% (B * A * B * Sigma12)) )
comp1_2_1 <- 2 * sum( diag(solve(a = Sigma22, b = t(Sigma12) %*% (B * dSigma12_dtheta))) )
comp1_2_2 <- sum( diag( FF %*% t(solve(a = Sigma22, b = t(B * Sigma12))) %*% dSigma22_dtheta ))
comp1_2_3 <- 2 * sum( diag( (FF %*% (B * Sigma12)) %*% (C %*% t(Sigma12) %*% (B * dSigma12_dtheta)) ))
comp1_2_4 <- sum( diag( (FF %*% (B * Sigma12) ) %*%
((C %*% t(Sigma12)) %*% (B * Sigma12) ) %*%
solve(a = Sigma22, b = dSigma22_dtheta)))
comp1 <- comp1_1 + comp1_2_1 - comp1_2_2 - comp1_2_3 + comp1_2_4
## compute component 2
## t(y - mu) %*% Sigma_inverse %*% dSigma/dtheta %*% Sigma_inverse %*% (y - mu)
# comp2_1 <- (1/Z) * (y - mu) - t(solve(a = R, b = solve(a = t(R), b = t( (1/Z) * Sigma12 )))) %*%
# (t((1/Z) * Sigma12) %*% (y - mu))
comp2_2 <- A * comp2_1 + 2 * dSigma12_dtheta %*% solve(a = Sigma22, b = t(Sigma12) %*% comp2_1) -
t(FF) %*% dSigma22_dtheta %*% solve(a = Sigma22, b = t(Sigma12) %*% comp2_1)
comp2 <- t(comp2_1) %*% comp2_2
## compute derivative of the trace term
if(par_name == "tau")
{
dtrace_term <- dtrace_term_dtau(cov_par = cov_par,
trace_term = current_trace_term)
}
if(par_name != "tau")
{
dtrace_term <- dtrace_term_dcov_par(cov_par = cov_par,
A_trace = A_trace)
}
## compute dlog(q(y|theta))/dtheta (the derivative of the laplace approximation wrt
## the p-th covariance parameter)
grad[which(names(grad) == par_name)] <- (1/2) * comp2 -
(1/2) * comp1 + dtrace_term
}
}
## optimize the knot locations if there is a derivative function for the knot locations
## loop through each covariance parameter
if(is.function(dcov_fun_dknot))
{
## make next two functions return sparse matrix
## function to create d(Sigma12)/dknot[k,d]
dsig12_dknot <- function(k,d, cov_par,
dcov_fun_dknot,
xu, xy,
bounds = knot_bounds,
transform = TRUE)
{
## k indexes the knot
## d indexes the dimension of the knot
vec_m <- numeric(length = nrow(xy))
for(i in 1:length(vec_m))
{
temp <- eval(
substitute(expr = dcov_fun_dknot(x1 = a, x2 = b, cov_par = c, transform = e, bounds = h),
env = list("a" = xy[i,], "b" = xu[k,], "c" = cov_par, "e" = transform, "h" = bounds))
)
vec_m[i] <- temp$derivative[d]
}
mat <- Matrix::Matrix(data = 0, nrow = nrow(xy), ncol = nrow(xu))
mat[,k] <- vec_m
return(mat)
}
## function to create d(Sigma22)/dknot[k,d]
dsig22_dknot <- function(k,d,
cov_par,
dcov_fun_dknot,
xu, xy,
transform = TRUE,
bounds = knot_bounds)
{
## note that because the derivative function is wrt x2, the fixed knot index k
## must always be in the second argument
mat <- Matrix::Matrix(data = 0, nrow = nrow(xu), ncol = nrow(xu))
for(i in 1:nrow(mat))
{
temp <- eval(
substitute(expr = dcov_fun_dknot(x1 = a, x2 = b, cov_par = c, transform = d, bounds = h),
env = list("a" = xu[i,], "b" = xu[k,], "c" = cov_par, "d" = transform, "h" = bounds))
)
mat[i,k] <- temp$derivative[d]
}
for(i in 1:nrow(mat))
{
temp <- eval(
substitute(expr = dcov_fun_dknot(x1 = a, x2 = b, cov_par = c, transform = d, bounds = h),
env = list("a" = xu[i,], "b" = xu[k,], "c" = cov_par, "d" = transform, "h" = bounds))
)
mat[k,i] <- temp$derivative[d]
}
return(mat)
}
## k indexes the knot, d indexes the dimension within the knot
p <- 0
# for(k in 1:length(knot_opt))
for(k in 1:nrow(xu))
{
## store transformed parameter values
if(transform == TRUE)
{
trans_knot[k,] <- dcov_fun_dknot(x1 = 0, x2 = xu[k,], cov_par = cov_par,
transform = transform, bounds = knot_bounds)$trans_par
}
for(d in 1:ncol(xu))
{
## p is the index for the knot gradient vector of length m * d
p <- p + 1
## make the gradient zero if the current knot k is not one of the knots we
## are optimizing over
if(!k %in% knot_opt)
{
grad_knot[p] <- 0
}
if(k %in% knot_opt)
{
# par_name <- names(cov_par)[p]
## first compute dSigma12/dknot[k,d]
dSigma12_dknot <- dsig12_dknot(k = k, d = d,
cov_par = cov_par,
dcov_fun_dknot = dcov_fun_dknot,
xu = xu,
xy = xy,
transform = transform,
bounds = knot_bounds)
## next compute dSigma22/dtheta_j
dSigma22_dknot <- dsig22_dknot(k = k, d = d,
cov_par = cov_par,
dcov_fun_dknot = dcov_fun_dknot,
xu = xu,
transform = transform,
bounds = knot_bounds)
##########################################################################
## create components to compute the derivative of the trace term
## Create the vector A_trace = diag( dvar(f)/dtheta - (d/dtheta) Sigma12 %*% Sigma22_inverse %*% Sigma21)
## if we are dealing with transformed parameters make sure you have the derivative dsigma/dsigma_transformed!!
A1_trace <- 0
#A1 <- rep(2 * cov_par$sigma * (ifelse(test = transform == TRUE, yes = cov_par$sigma, no = 1)), times = nrow(xy))
A2_trace <- numeric(length = nrow(xy))
temp1 <- (2 * dSigma12_dknot - t(FF) %*% dSigma22_dknot)
A2_trace <- apply(X = temp1 * t(FF), MARGIN = 1, FUN = sum)
# temp2 <- FF
# for(i in 1:length(A2))
# {
# A2[i] <- temp1[i,] %*% temp2[,i]
# }
A_trace <- A1_trace - A2_trace
##########################################################################
## Create the vector A = diag( dvar(y)/dtheta - (d/dtheta) Sigma12 %*% Sigma22_inverse %*% Sigma21)
## if we are dealing with transformed parameters make sure you have the derivative dsigma/dsigma_transformed!!
A1 <- 0
A <- A1
## we compute the gradient by summing some components together
## compute component 1
## trace( (Sigma)_inverse dSigma/dtheta_j)
comp1_1 <- sum(A * B) - sum( diag(C %*% t(Sigma12) %*% (B * A * B * Sigma12)) )
comp1_2_1 <- 2 * sum( diag(solve(a = Sigma22, b = as.matrix(t(Sigma12) %*% (B * dSigma12_dknot))) ) )
comp1_2_2 <- sum( diag( as.matrix(FF %*% t(solve(a = Sigma22, b = t(B * Sigma12))) %*% dSigma22_dknot) ))
comp1_2_3 <- 2 * sum( diag( as.matrix( (FF %*% (B * Sigma12)) %*% (C %*% t(Sigma12) %*% (B * dSigma12_dknot))) ))
comp1_2_4 <- sum( diag( (FF %*% (B * Sigma12) ) %*%
((C %*% t(Sigma12)) %*% (B * Sigma12) ) %*%
solve(a = Sigma22, b = as.matrix(dSigma22_dknot) )))
comp1 <- comp1_1 + comp1_2_1 - comp1_2_2 - comp1_2_3 + comp1_2_4
## compute component 2
## t(y - mu) %*% Sigma_inverse %*% dSigma/dtheta %*% Sigma_inverse %*% (y - mu)
# comp2_1 <- (1/Z) * (y - mu) - t(solve(a = Sigma22 + t(Sigma12) %*% ((1/Z) * Sigma12), b = t( (1/Z) * Sigma12 ))) %*%
# (t((1/Z) * Sigma12) %*% (y - mu))
# comp2_1 <- (1/Z) * (y - mu) - t( solve(a = R, b = solve(a = t(R), b = t( (1/Z) * Sigma12 )))) %*%
# (t((1/Z) * Sigma12) %*% (y - mu))
comp2_2 <- A * comp2_1 + 2 * dSigma12_dknot %*% solve(a = Sigma22, b = t(Sigma12) %*% comp2_1) -
t(FF) %*% dSigma22_dknot %*% solve(a = Sigma22, b = t(Sigma12) %*% comp2_1)
comp2 <- t(comp2_1) %*% comp2_2
## compute derivative of the trace term wrt the knot
dtrace_term <- dtrace_term_dcov_par(cov_par = cov_par, A_trace = A_trace)
## compute dlog(q(y|theta))/dtheta (the derivative of the laplace approximation wrt
## the p-th knot)
grad_knot[p] <- (1/2) * as.numeric(comp2) -
(1/2) * as.numeric(comp1) + dtrace_term
}
}
}
}
if(is.function(dcov_fun_dknot))
{
return(list("gradient" = grad, "knot_gradient" = grad_knot, "trans_par" = trans_par, "trans_knot" = trans_knot))
}
if(is.na(dcov_fun_dknot))
{
return(list("gradient" = grad, "trans_par" = trans_par))
}
}
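## A hedged usage sketch of the gradient function for an exponential covariance.
## dexp_dsigma and dexp_dl are the derivative functions referenced later in this file;
## "dexp_dtau" is a hypothetical name for the corresponding nugget derivative and may be
## named differently in the package. Wrapped in if (FALSE) so it does not run when this
## file is sourced.
if (FALSE)
{
  set.seed(1)
  xy <- matrix(sort(runif(50)), ncol = 1)
  y <- sin(2 * pi * xy[, 1]) + rnorm(50, sd = 0.2)
  xu <- matrix(seq(0.1, 0.9, length.out = 5), ncol = 1)
  cov_par <- list("sigma" = 1, "l" = 0.2, "tau" = 0.3)
  grad_out <- delbo_dcov_par(cov_par = cov_par,
                             cov_fun = "exp",
                             dcov_fun_dtheta = list("sigma" = dexp_dsigma,
                                                    "l" = dexp_dl,
                                                    "tau" = dexp_dtau), ## hypothetical name
                             dcov_fun_dknot = NA,
                             knot_opt = NA,
                             xu = xu, xy = xy, y = y,
                             mu = rep(mean(y), length(y)),
                             transform = TRUE,
                             delta = 1e-6)
  grad_out$gradient   ## ELBO gradient wrt the (transformed) covariance parameters
  grad_out$trans_par  ## current parameter values on the unconstrained scale
}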
## Gradient Ascent with the ELBO
#' @export
norm_grad_ascent_vi <- function(cov_par_start,
cov_fun,
dcov_fun_dtheta,
dcov_fun_dknot,
knot_opt,
xu,
xy,
y,
mu = NA,
muu = NA,
obj_fun = elbo_fun,
opt = list(),
verbose = FALSE,
...)
{
## cov_par_start is a list of the starting values of the covariance function parameters
## NAMES AND ORDERING MUST MATCH dcov_fun_dtheta
## cov_fun is a string specifying the covariance function ("sqexp", "exp", or "ard")
## xy are the observed data locations (matrix where rows are observations)
## knot_opt is a vector giving the knots over which to optimize, other knots are held constant
## xu are the unobserved knot locations (matrix where rows are knot locations)
## y is the vector of the observed data values
## mu is the mean of the GP at each of the observed data locations
## muu is the mean of the GP at each of the knot locations
## dcov_fun_dtheta is a list of functions which corresponds (in order) to the covariance parameters in
## cov_par. These functions compute the derivatives of the covariance function wrt the particular covariance parameter.
## NAMES AND ORDERING MUST MATCH cov_par
## dcov_fun_dknot is a single function that gives the derivative of the covariance function
## wrt the second input location x2, which I will always use for the knot location.
## transform is a logical argument that says whether to optimize on scales such that parameters are unconstrained
## obj_fun is the objective function
## learn_rate is the learning rate (step size) for the algorithm
## maxit is the maximum number of iterations before cutting it off
## tol is the absolute tolerance level which dictates
## how small a difference the algorithm is allowed to make before stopping
## to the objective function as well as how close the gradient is allowed to be to zero
## ... are arguments to be passed to the functions taking derivatives of log(p(y|lambda)) wrt ff
## transform is retained because the gradient functions expect it; it indicates that
## gradients are taken with respect to transformed (unconstrained) parameters
transform <- TRUE
## jiggle xu to make sure that knot locations aren't exactly at data locations
## only necessary so that the covariance function doesn't add a nugget
# ## between two nuggetless function values
# for(i in 1:nrow(xu))
# {
# if(any(
# apply(X = xy, MARGIN = 1, FUN = function(x,y){isTRUE(all.equal(target = y, current = x))}, y = xu[i,]) == TRUE
# )
# )
# {
# xu[i,] <- xu[i,] + rnorm(n = ncol(xu),sd = 1e-6)
# }
# }
## extract names of optional arguments from opt
opt_master = list("optim_method" = "adadelta",
"decay" = 0.95, "epsilon" = 1e-6, "learn_rate" = 1e-2, "eta" = 1e3,
"maxit" = 1000, "obj_tol" = 1e-3, "grad_tol" = Inf, "delta" = 1e-6)
## potentially change default values in opt_master to user supplied values
if(length(opt) > 0)
{
for(i in 1:length(opt))
{
## if an argument is misspelled or not recognized, skip it
if(!any(names(opt_master) == names(opt)[i]))
{
# print("Warning: you supplied an argument to opt() not utilized by this function.")
next
}
else{
ind <- which(names(opt_master) == names(opt)[i])
opt_master[[ind]] <- opt[[i]]
}
}
}
optim_method = opt_master$optim_method
optim_par = list("decay" = opt_master$decay, "epsilon" = opt_master$epsilon,
"eta" = opt_master$eta, "learn_rate" = opt_master$learn_rate)
maxit = opt_master$maxit
obj_tol = opt_master$obj_tol
grad_tol = opt_master$grad_tol
delta <- opt_master$delta
if(!is.numeric(mu))
{
mu <- rep(mean(y), times = length(y))
}
if(!is.numeric(muu))
{
muu <- rep(mean(y), times = nrow(xu))
}
## make sure y is numeric
y <- as.numeric(y)
## store objective function evaluations, gradient values, and parameter values
## initialize vector of objective function values
current_cov_par <- unlist(cov_par_start) ## vector of covariance parameter values (without inducing points)
cov_par_vals <- matrix(ncol = length(current_cov_par)) ## store the covariance parameter values
cov_par <- as.list(current_cov_par)
obj_fun_vals <- numeric() ## store the objective function values
current_obj_fun <- NA ## current objective function value
if(is.list(dcov_fun_dtheta))
{
grad_vals <- matrix(ncol = length(current_cov_par)) ## store gradient values
current_grad_val_theta <- NA ## current gradient value
}
if(!is.list(dcov_fun_dtheta))
{
current_grad_val_theta <- 0 ## current gradient value
grad_vals <- matrix(ncol = length(current_cov_par)) ## store gradient values
}
grad_knot_vals <- matrix(ncol = nrow(xu) * ncol(xu)) ## store gradient values
current_grad_val_knot <- NA ## current gradient value
## current objective function value
if(cov_fun == "ard")
{
lnames <- paste("l", 1:ncol(xy), sep = "")
Sigma12 <- make_cov_mat_ardC(x = xy, x_pred = xu, cov_par = cov_par,
cov_fun = cov_fun,
delta = delta, lnames = lnames)
Sigma22 <- make_cov_mat_ardC(x = xu, x_pred = matrix(),
cov_par = cov_par,
cov_fun = cov_fun,
delta = delta,
lnames = lnames) -
as.list(cov_par)$tau^2 * diag(nrow(xu))
}
else{
lnames <- character()
Sigma12 <- make_cov_matC(x = xy, x_pred = xu, cov_par = cov_par, cov_fun = cov_fun, delta = delta)
Sigma22 <- make_cov_matC(x = xu, x_pred = matrix(),
cov_par = cov_par,
cov_fun = cov_fun, delta = delta) - as.list(cov_par)$tau^2 * diag(nrow(xu))
}
## create Z
# Z2 <- solve(a = Sigma22, b = t(Sigma12))
Z <- rep(cov_par$tau^2 + delta , times = nrow(Sigma12))
current_obj_fun <- obj_fun(mu = mu,
Z = Z,
Sigma12 = Sigma12,
Sigma22 = Sigma22,
y = y,
trace_term_fun = trace_term_fun,
cov_par = cov_par, delta = delta, ...) ## store the ELBO value
obj_fun_vals[1] <- current_obj_fun
cov_par_vals[1,] <- current_cov_par
temp_grad_eval <- delbo_dcov_par(cov_par = cov_par, cov_fun = cov_fun, ## store the results of gradient function
dcov_fun_dtheta = dcov_fun_dtheta,
dcov_fun_dknot = dcov_fun_dknot,
knot_opt = knot_opt,
xu = xu, xy = xy,
y = y, ff = NA, mu = mu, delta = delta,
transform = transform)
if(is.list(dcov_fun_dtheta))
{
current_grad_val_theta <- c(temp_grad_eval$gradient) ## store the current gradient
grad_vals[1,] <- current_grad_val_theta
}
current_cov_par_trans <- unlist(temp_grad_eval$trans_par) ## store the current value of the transformed covariance parameters
## potentially record the gradient of the knot locations
if(is.function(dcov_fun_dknot))
{
current_grad_val_knot <- c(temp_grad_eval$knot_gradient) ## store the current gradient
grad_knot_vals[1,] <- current_grad_val_knot
xu_vals <- xu
## get bounds for knots
knot_bounds_l <- apply(X = xy, MARGIN = 2, FUN = min)
knot_bounds_u <- apply(X = xy, MARGIN = 2, FUN = max)
diffs <- knot_bounds_u - knot_bounds_l
knot_bounds <- cbind(knot_bounds_l - diffs/10, knot_bounds_u + diffs/10)
current_xu_trans <- temp_grad_eval$trans_knot
}
if(!is.function(dcov_fun_dknot))
{
current_grad_val_knot <- 0
}
## run gradient ascent updates until convergence criteria is met or the maxit is reached
iter <- 1
# obj_fun_vals[1] <- 1e4 ## this is a line to make the objective fn check work in while loop...
## optimization for optim_method = "ga"
if(optim_method == "ga")
{
while(iter < maxit &&
(any(abs(c(current_grad_val_theta, current_grad_val_knot)) > grad_tol) ||
ifelse(iter > 1, yes = abs(current_obj_fun - obj_fun_vals[iter - 1]) > obj_tol, no = TRUE)))
{
## update the iteration counter
iter <- iter + 1
if(verbose == TRUE)
{
print(paste("iteration ", iter, sep = ""))
print(c(current_grad_val_theta, current_grad_val_knot))
# print(current_obj_fun)
}
if(transform == TRUE)
{
## take a GA step in transformed space
if(is.list(dcov_fun_dtheta))
{
current_cov_par_trans <- current_cov_par_trans + optim_par$learn_rate * current_grad_val_theta
## recover the untransformed parameter values
for(j in 1:length(cov_par))
{
## transform the transformed parameters back to the original parameter space
if(cov_fun == "ard" & names(cov_par)[j] %in% lnames)
{
current_cov_par[j] <- real_to_pos(x = current_cov_par_trans[j])
}
else{
current_cov_par[j] <- dcov_fun_dtheta[[which(names(dcov_fun_dtheta) == names(current_cov_par)[j])]](x1 = 0, x2 = 2,
cov_par = cov_par_start,
transform = TRUE)$trans_fun(current_cov_par_trans[j])
}
}
cov_par <- as.list(current_cov_par)
}
if(is.function(dcov_fun_dknot))
{
## Note that the gradient vector for the knots looks like this
## c( xu[1,1], xu[1,2], ..., xu[1,d], xu[2,1], ..., xu[m,d] )
current_xu_trans <- current_xu_trans + optim_par$learn_rate * matrix(data = current_grad_val_knot,
nrow = nrow(xu),
ncol = ncol(xu),
byrow = TRUE)
xu <- apply(X = current_xu_trans, MARGIN = 1,
FUN = dcov_fun_dknot(x1 = 0, x2 = 0, cov_par = cov_par, transform = TRUE, bounds = knot_bounds)$trans_fun,
bounds = knot_bounds)
xu <- matrix(xu, ncol = ncol(xy), byrow = TRUE)
xu_vals <- abind::abind(xu_vals, xu, along = 3)
}
}
if(transform == FALSE)
{
current_cov_par <- current_cov_par + optim_par$learn_rate * current_grad_val_theta
cov_par <- as.list(current_cov_par)
if(is.function(dcov_fun_dknot))
{
## Note that the gradient vector for the knots looks like this
## c( xu[1,1], xu[1,2], ..., xu[1,d], xu[2,1], ..., xu[m,d] )
xu <- xu + optim_par$learn_rate * matrix(data = current_grad_val_knot,
nrow = nrow(xu),
ncol = ncol(xu),
byrow = TRUE)
xu_vals <- abind::abind(xu_vals, xu, along = 3)
}
}
## current objective function value
if(cov_fun == "ard")
{
lnames <- paste("l", 1:ncol(xy), sep = "")
Sigma12 <- make_cov_mat_ardC(x = xy, x_pred = xu, cov_par = cov_par,
cov_fun = cov_fun,
delta = delta, lnames = lnames)
Sigma22 <- make_cov_mat_ardC(x = xu, x_pred = matrix(),
cov_par = cov_par,
cov_fun = cov_fun,
delta = delta,
lnames = lnames) -
as.list(cov_par)$tau^2 * diag(nrow(xu))
}
else{
lnames <- character()
Sigma12 <- make_cov_matC(x = xy, x_pred = xu, cov_par = cov_par, cov_fun = cov_fun, delta = delta)
Sigma22 <- make_cov_matC(x = xu, x_pred = matrix(),
cov_par = cov_par,
cov_fun = cov_fun, delta = delta) - as.list(cov_par)$tau^2 * diag(nrow(xu))
}
## create Z
# Z2 <- solve(a = Sigma22, b = t(Sigma12))
Z <- numeric()
Z <- rep(cov_par$tau^2 + delta, times = nrow(Sigma12))
current_obj_fun <- obj_fun(mu = mu,
Z = Z,
Sigma12 = Sigma12,
Sigma22 = Sigma22,
y = y,
trace_term_fun = trace_term_fun,
cov_par = cov_par, delta = delta, ...) ## store the ELBO value
obj_fun_vals[iter] <- current_obj_fun
cov_par_vals <- rbind(cov_par_vals,current_cov_par)
temp_grad_eval <- delbo_dcov_par(cov_par = as.list(current_cov_par), cov_fun = cov_fun, ## store the results of gradient function
dcov_fun_dtheta = dcov_fun_dtheta,
dcov_fun_dknot = dcov_fun_dknot,
knot_opt = knot_opt,
xu = xu, xy = xy,
y = y, ff = NA, mu = mu,
delta = delta, transform = transform)
if(is.list(dcov_fun_dtheta))
{
current_grad_val_theta <- c(temp_grad_eval$gradient) ## store the current gradient
current_cov_par_trans <- unlist(temp_grad_eval$trans_par) ## store the current value of the transformed covariance parameters
grad_vals <- rbind(grad_vals, current_grad_val_theta)
}
## potentially record the gradient of the knot locations
if(is.function(dcov_fun_dknot))
{
current_grad_val_knot <- c(temp_grad_eval$knot_gradient) ## store the current gradient
grad_knot_vals <- rbind(grad_knot_vals, current_grad_val_knot)
}
}
}
## optimization for optim_method = "adadelta"
sg2_theta <- 0
sg2_knot <- 0
sd2_theta <- 0
sd2_knot <- 0
if(optim_method == "adadelta")
{
if(is.list(dcov_fun_dtheta))
{
## store the decaying sum of squared gradients
sg2_theta <- rep(0, times = length(current_cov_par))
## store the decaying sum of updates
sd2_theta <- rep(0, times = length(current_cov_par))
sign_change_theta <- rep(0, times = length(current_cov_par))
}
if(is.function(dcov_fun_dknot))
{
## store the decaying sum of squared gradients
sg2_knot <- rep(0, times = nrow(xu)*ncol(xu))
## store the decaying sum of updates
sd2_knot <- rep(0, times = nrow(xu)*ncol(xu))
sign_change_knot <- rep(0, times = nrow(xu)*ncol(xu))
}
## do the optimization
while(iter < maxit &&
(any(abs(c(current_grad_val_theta, current_grad_val_knot)) > grad_tol) ||
ifelse(iter > 1, yes = abs(current_obj_fun - obj_fun_vals[iter - 1]) > obj_tol, no = TRUE)))
{
## update the iteration counter
iter <- iter + 1
#
if(verbose == TRUE)
{
print(paste("iteration ", iter, sep = ""))
print(c(current_grad_val_theta, current_grad_val_knot))
# print(current_cov_par)
# print(current_obj_fun)
}
if(transform == TRUE)
{
## take a adadelta step in transformed space
if(is.list(dcov_fun_dtheta))
{
## accumulate the gradient
sg2_theta <- optim_par$decay * sg2_theta + (1 - optim_par$decay) * current_grad_val_theta^2
## calculate adapted step size
# delta_theta <- (sqrt(sd2_theta + rep(optim_par$epsilon, times = length(sd2_theta)) ) / sqrt(sg2_theta + rep(optim_par$epsilon, times = length(sd2_theta)))) * current_grad_val_theta
delta_theta <- ((1/optim_par$eta)^(sign_change_theta)) * (sqrt(sd2_theta + rep(optim_par$epsilon, times = length(sd2_theta)) ) / sqrt(sg2_theta + rep(optim_par$epsilon, times = length(sd2_theta)))) * current_grad_val_theta
# delta_theta <- optim_par$learn_rate / sqrt(sg2_theta + rep(optim_par$epsilon, times = length(sd2_theta))) * current_grad_val_theta
# if(verbose == TRUE)
# {
# print("adaptive step size theta")
# # print(optim_par$learn_rate / sqrt(sg2_theta + rep(optim_par$epsilon, times = length(sd2_theta))))
# print(((1/optim_par$eta)^(sign_change_theta)) * (sqrt(sd2_theta + rep(optim_par$epsilon, times = length(sd2_theta)) ) / sqrt(sg2_theta + rep(optim_par$epsilon, times = length(sd2_theta)))))
# # print(((1/optim_par$eta)^(sign_change_theta)))
# }
## accumulate the step size
sd2_theta <- optim_par$decay * sd2_theta + (1 - optim_par$decay) * delta_theta^2
## update the parameters
current_cov_par_trans <- current_cov_par_trans + delta_theta
}
if(is.function(dcov_fun_dknot))
{
## Note that the gradient vector for the knots looks like this
## c( xu[1,1], xu[1,2], ..., xu[1,d], xu[2,1], ..., xu[m,d] )
## accumulate the gradient
sg2_knot <- optim_par$decay * sg2_knot + (1 - optim_par$decay) * current_grad_val_knot^2
## calculate adapted step size
# delta_knot <- (sqrt(sd2_knot + rep(optim_par$epsilon, times = length(sd2_knot))) / sqrt(sg2_knot + rep(optim_par$epsilon, times = length(sd2_knot)))) * current_grad_val_knot
delta_knot <- ((1/optim_par$eta)^(sign_change_knot)) * (sqrt(sd2_knot + rep(optim_par$epsilon, times = length(sd2_knot))) / sqrt(sg2_knot + rep(optim_par$epsilon, times = length(sd2_knot)))) * current_grad_val_knot
# delta_knot <- (optim_par$learn_rate / sqrt(sg2_knot + rep(optim_par$epsilon, times = length(sd2_knot)))) * current_grad_val_knot
## accumulate the step size
sd2_knot <- optim_par$decay * sd2_knot + (1 - optim_par$decay) * delta_knot^2
current_xu_trans <- current_xu_trans + matrix(data = delta_knot,
nrow = nrow(xu),
ncol = ncol(xu),
byrow = TRUE)
xu <- apply(X = current_xu_trans, MARGIN = 1,
FUN = dcov_fun_dknot(x1 = 0, x2 = 0, cov_par = cov_par, transform = TRUE, bounds = knot_bounds)$trans_fun,
bounds = knot_bounds)
xu <- matrix(xu, ncol = ncol(xy), byrow = TRUE)
xu_vals <- abind::abind(xu_vals, xu, along = 3)
}
if(is.list(dcov_fun_dtheta))
{
## recover the untransformed parameter values
for(j in 1:length(cov_par))
{
## transform the transformed parameters back to the original parameter space
if(cov_fun == "ard" & names(cov_par)[j] %in% lnames)
{
current_cov_par[j] <- real_to_pos(x = current_cov_par_trans[j])
}
else{
current_cov_par[j] <- dcov_fun_dtheta[[which(names(dcov_fun_dtheta) == names(current_cov_par)[j])]](x1 = 0, x2 = 2,
cov_par = cov_par_start,
transform = TRUE)$trans_fun(current_cov_par_trans[j])
}
}
cov_par <- as.list(current_cov_par)
}
}
## This commented code does gradient ascent... not modified adadelta
## I should probably cut out the transform argument and mandate a transformation to the whole real line
# if(transform == FALSE)
# {
# current_cov_par <- current_cov_par + optim_par$learn_rate * current_grad_val_theta
#
# if(is.function(dcov_fun_dknot))
# {
# ## Note that the gradient vector for the knots looks like this
# ## c( xu[1,1], xu[1,2], ..., xu[1,d], xu[2,1], ..., xu[m,d] )
# xu <- xu + optim_par$learn_rate * matrix(data = current_grad_val_knot,
# nrow = nrow(xu),
# ncol = ncol(xu),
# byrow = TRUE)
# xu_vals <- abind::abind(xu_vals, xu, along = 3)
# }
# }
## current objective function value
if(cov_fun == "ard")
{
lnames <- paste("l", 1:ncol(xy), sep = "")
Sigma12 <- make_cov_mat_ardC(x = xy, x_pred = xu, cov_par = cov_par,
cov_fun = cov_fun,
delta = delta, lnames = lnames)
Sigma22 <- make_cov_mat_ardC(x = xu, x_pred = matrix(),
cov_par = cov_par,
cov_fun = cov_fun,
delta = delta,
lnames = lnames) -
as.list(cov_par)$tau^2 * diag(nrow(xu))
}
else{
lnames <- character()
Sigma12 <- make_cov_matC(x = xy, x_pred = xu, cov_par = cov_par, cov_fun = cov_fun, delta = delta)
Sigma22 <- make_cov_matC(x = xu, x_pred = matrix(),
cov_par = cov_par,
cov_fun = cov_fun, delta = delta) - as.list(cov_par)$tau^2 * diag(nrow(xu))
}
## create Z
# Z2 <- solve(a = Sigma22, b = t(Sigma12))
Z <- numeric()
Z <- rep(cov_par$tau^2 + delta, times = nrow(Sigma12))
current_obj_fun <- obj_fun(mu = mu,
Z = Z,
Sigma12 = Sigma12,
Sigma22 = Sigma22,
y = y,
trace_term_fun = trace_term_fun,
cov_par = cov_par, delta = delta, ...) ## store the ELBO value
obj_fun_vals[iter] <- current_obj_fun
cov_par_vals <- rbind(cov_par_vals,current_cov_par)
temp_grad_eval <- delbo_dcov_par(cov_par = as.list(current_cov_par), cov_fun = cov_fun, ## store the results of gradient function
dcov_fun_dtheta = dcov_fun_dtheta,
dcov_fun_dknot = dcov_fun_dknot,
knot_opt = knot_opt,
xu = xu, xy = xy,
y = y, ff = NA, mu = mu, delta = delta, transform = transform)
if(is.list(dcov_fun_dtheta))
{
## get the sign change and add it to the last one
sign_change_theta <- (optim_par$decay * sign_change_theta +
(1 - optim_par$decay) * abs(sign(c(temp_grad_eval$gradient)) - sign(current_grad_val_theta))/2)
# sign_change_theta <- sign_change_theta +
# (1 - optim_par$decay) * abs(sign(c(temp_grad_eval$gradient)) - sign(current_grad_val_theta))
current_grad_val_theta <- c(temp_grad_eval$gradient) ## store the current gradient
current_cov_par_trans <- unlist(temp_grad_eval$trans_par) ## store the current value of the transformed covariance parameters
grad_vals <- rbind(grad_vals, current_grad_val_theta)
}
## potentially record the gradient of the knot locations
if(is.function(dcov_fun_dknot))
{
## get the sign change and add it to the last one
sign_change_knot <- (optim_par$decay * sign_change_knot +
(1 - optim_par$decay) * abs(sign(c(temp_grad_eval$knot_gradient)) - sign(current_grad_val_knot)))
# sign_change_knot <- sign_change_knot +
# (1 - optim_par$decay) * abs(sign(c(temp_grad_eval$knot_gradient)) - sign(current_grad_val_knot))
current_grad_val_knot <- c(temp_grad_eval$knot_gradient) ## store the current gradient
grad_knot_vals <- rbind(grad_knot_vals, current_grad_val_knot)
}
}
}
## get the posterior distribution of u
## Compute Z^(-1) Sigma12
ZSig12 <- (1/Z) * Sigma12
## compute muu + Sigma21 %*% (Q_nn + tau^2 I)^(-1) %*% (y - mu), where Q_nn = Sigma12 %*% solve(Sigma22) %*% t(Sigma12);
## this is the posterior mean of u, the GP at the knot locations
# R1 <- chol(Sigma22)
R1 <- chol(Sigma22 + t(Sigma12) %*% ZSig12)
# R2 <- chol(solve(Sigma22) + t(Sigma12) %*% ZSig12)
# u_mean <- muu + as.numeric(t(ZSig12) %*% (ff - mu)) -
# as.numeric( t(Sigma12) %*% (ZSig12 %*%
# solve(a = Sigma22 + t(Sigma12) %*% ZSig12, b = t(ZSig12) %*% (ff - mu))) )
u_mean <- muu + as.numeric(t(ZSig12) %*% (y - mu)) -
as.numeric( t(Sigma12) %*% (ZSig12 %*%
solve( a = R1, b = solve(a = t(R1), b = t(ZSig12) %*% (y - mu)))) )
## compute the posterior variance of u, the GP at the knot location
u_var <- Sigma22 - t(Sigma12) %*% ZSig12 +
(t(ZSig12) %*% t(solve(a = t(R1), b = t(Sigma12))) %*% solve(a = t(R1), b = t(Sigma12)) %*% ZSig12)
if(is.function(dcov_fun_dknot))
{
return(list("cov_par" = as.list(current_cov_par),
"cov_fun" = cov_fun,
"xu" = xu,
"xy" = xy,
"mu" = mu,
"muu" = muu,
"u_mean" = u_mean,
"u_var" = u_var,
"iter" = iter,
"obj_fun" = obj_fun_vals,
"grad" = grad_vals,
"knot_grad" = grad_knot_vals,
"knot_history" = xu_vals,
"cov_par_history" = cov_par_vals))
}
if(!is.function(dcov_fun_dknot))
{
return(list("cov_par" = as.list(current_cov_par),
"cov_fun" = cov_fun,
"xu" = xu,
"xy" = xy,
"mu" = mu,
"muu" = muu,
"u_mean" = u_mean,
"u_var" = u_var,
"cov_fun" = cov_fun,
"iter" = iter,
"obj_fun" = obj_fun_vals,
"grad" = grad_vals,
"knot_grad" = 0,
"knot_history" = xu,
"cov_par_history" = cov_par_vals))
}
}
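## A hedged end-to-end sketch of fitting the variational approximation by gradient
## ascent with fixed knots. As above, dexp_dsigma and dexp_dl are the package's
## exponential-covariance derivative functions, while "dexp_dtau" is a hypothetical name
## for the nugget derivative. Wrapped in if (FALSE) so it does not run when this file is
## sourced.
if (FALSE)
{
  set.seed(3)
  xy <- matrix(sort(runif(100)), ncol = 1)
  y <- sin(2 * pi * xy[, 1]) + rnorm(100, sd = 0.25)
  xu <- matrix(seq(0.05, 0.95, length.out = 8), ncol = 1)
  fit <- norm_grad_ascent_vi(cov_par_start = list("sigma" = 1, "l" = 0.2, "tau" = 0.5),
                             cov_fun = "exp",
                             dcov_fun_dtheta = list("sigma" = dexp_dsigma,
                                                    "l" = dexp_dl,
                                                    "tau" = dexp_dtau), ## hypothetical name
                             dcov_fun_dknot = NA,
                             knot_opt = NA,
                             xu = xu, xy = xy, y = y,
                             opt = list("maxit" = 500, "obj_tol" = 1e-4),
                             verbose = FALSE)
  fit$cov_par                    ## estimated covariance parameters
  plot(fit$obj_fun, type = "l")  ## ELBO trace over iterations
}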
## predict with the VI model
#' @export
predict_vi <- function(u_mean,
u_var,
xu,
x_pred,
cov_fun,
cov_par,
mu,
muu,
full_cov = FALSE,
family = "gaussian",
delta = 1e-6)
{
## u_mean is the posterior mean at the knot locations
## u_var is the posterior variance at the knot locations
## xu is a matrix where rows correspond to knots
## cov_fun is the covariance function
## cov_par are the covariance parameters
## mu is the marginal mean of the GP at the prediction locations x_pred
## muu is the marginal mean of the GP at the knot locations
## full_cov is a logical indicating whether the full variance covariance matrix for predictions should be returned
## create Sigma22
## create Sigma12 where the 1 corresponds to locations at which we'd like predictions
if(family == "gaussian")
{
if(cov_fun == "ard")
{
lnames <- paste("l", 1:ncol(xu), sep = "")
Sigma22 <- make_cov_mat_ardC(x = xu,
x_pred = matrix(),
cov_fun = cov_fun ,
cov_par = cov_par,
delta = delta,
lnames = lnames) -
cov_par$tau^2 * diag(length(u_mean))
}
else{
Sigma22 <- make_cov_matC(x = xu, x_pred = matrix(), cov_fun = cov_fun , cov_par = cov_par, delta = delta) -
cov_par$tau^2 * diag(length(u_mean))
}
}
if(family != "gaussian")
{
return("Error: Only Gaussian data currently supported.")
# Sigma22 <- make_cov_matC(x = xu, x_pred = matrix(), cov_fun = cov_fun , cov_par = cov_par, delta = delta)
}
## calculate Sigma22 inverse
Sigma22_inv <- solve(a = Sigma22)
if(cov_fun == "ard")
{
lnames <- paste("l", 1:ncol(xu), sep = "")
Sigma12 <- make_cov_mat_ardC(x = x_pred,
x_pred = xu,
cov_fun = cov_fun ,
cov_par = cov_par,
delta = delta,
lnames = lnames)
}
else{
Sigma12 <- make_cov_matC(x = x_pred, x_pred = xu, cov_fun = cov_fun , cov_par = cov_par, delta = delta)
}
## calculate the predicted mean
pred_mean <- mu + Sigma12 %*% solve(a = Sigma22, b = u_mean - muu)
## calculate the predictive variance
## create Sigma11 which is the prior variance covariance matrix at the
## locations at which we desire predictions
if(full_cov == TRUE)
{
if(cov_fun == "ard")
{
lnames <- paste("l", 1:ncol(x_pred), sep = "")
Sigma11 <- make_cov_mat_ardC(x = x_pred,
x_pred = matrix(),
cov_fun = cov_fun,
cov_par = cov_par,
delta = delta,
lnames = lnames)
}
else{
Sigma11 <- make_cov_matC(x = x_pred,
x_pred = matrix(),
cov_fun = cov_fun,
cov_par = cov_par,
delta = delta)
}
pred_var <- Sigma11 + Sigma12 %*%
(-Sigma22_inv + Sigma22_inv %*% u_var %*% Sigma22_inv) %*%
t(Sigma12)
}
if(full_cov == FALSE)
{
temp22 <- (-Sigma22_inv + Sigma22_inv %*% u_var %*% Sigma22_inv)
Sigma11 <- cov_par$tau^2 + cov_par$sigma^2 + delta
pred_var <- Sigma11 + apply(X = Sigma12 *
t((-Sigma22_inv + Sigma22_inv %*% u_var %*% Sigma22_inv) %*%
t(Sigma12)), MARGIN = 1, FUN = sum )
#
# pred_var <- numeric()
# for(i in 1:nrow(Sigma12))
# {
# pred_var[i] <- cov_par$tau^2 + (t(Sigma12[i,]) %*% solve(a = Sigma22, b = t(t(Sigma12[i,]) ))) +
# t(Sigma12[i,]) %*% temp22 %*% t(t(Sigma12[i,]))
# }
}
return(list("pred_mean" = pred_mean, "pred_var" = pred_var))
}
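## A short sketch of prediction at new locations, assuming the "fit" object from the
## norm_grad_ascent_vi sketch above (so the constant-mean choice mu = mean(y) carries
## over). Wrapped in if (FALSE) so it does not run when this file is sourced.
if (FALSE)
{
  x_new <- matrix(seq(0, 1, length.out = 200), ncol = 1)
  preds <- predict_vi(u_mean = fit$u_mean,
                      u_var = fit$u_var,
                      xu = fit$xu,
                      x_pred = x_new,
                      cov_fun = fit$cov_fun,
                      cov_par = fit$cov_par,
                      mu = rep(fit$mu[1], nrow(x_new)),  ## constant GP mean at prediction locations
                      muu = fit$muu,
                      full_cov = FALSE)
  preds$pred_mean  ## predictive mean of y at x_new
  preds$pred_var   ## predictive marginal variance, including the nugget
}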
## OAT with the VI model
#' @export
oat_knot_selection_norm_vi <- function(cov_par_start,
cov_fun,
dcov_fun_dtheta,
dcov_fun_dknot,
obj_fun = elbo_fun,
xu_start,
proposal_fun,
xy,
y,
ff = NA,
mu,
muu_start,
transform = TRUE,
opt = list(),
verbose = FALSE,
...)
{
## cov_par_start is a list of the starting values of the covariance function parameters
## NAMES AND ORDERING MUST MATCH dcov_fun_dtheta
## cov_fun is the covariance function
## xy are the observed data locations (matrix where rows are observations)
## xu_start are the initial knot locations (matrix where rows are knot locations)
## proposal_fun is a function that proposes new knot locations to be optimized by gradient ascent
## y is the vector of the observed data values
## mu is the mean of the GP at each of the observed data locations
## muu_start is the mean of the GP at each of the starting knot locations
## ff is the current value for the f vector that maximizes log(p(y|f)) + log(p(f|theta, xu))
## dcov_fun_dtheta is a list of functions which corresponds (in order) to the covariance parameters in
## cov_par. These functions compute the derivatives of the covariance function wrt the particular covariance parameter.
## NAMES AND ORDERING MUST MATCH cov_par
## dcov_fun_dknot is a single function that gives the derivative of the covariance function
## wrt the second input location x2, which I will always use for the knot location.
## transform is a logical argument that says whether to optimize on scales such that parameters are unconstrained
## obj_fun is the objective function
## learn_rate is the learning rate (step size) for the algorithm
## maxit is the maximum number of iterations before cutting it off
## tol is the absolute tolerance level which dictates
## how small a difference the algorithm is allowed to make before stopping
## to the objective function as well as how close the gradient is allowed to be to zero
## ... are arguments to be passed to the functions taking derivatives of log(p(y|lambda)) wrt ff
## as well as to the knot proposal function
## extract names of optional arguments from opt
opt_master = list("optim_method" = "adadelta",
"decay" = 0.95, "epsilon" = 1e-6, "learn_rate" = 1e-2, "eta" = 1e3,
"maxit" = 1000, "obj_tol" = 1e-3, "grad_tol" = 1,
"maxit_nr" = 1000, "delta" = 1e-6,
"tol_knot" = 1e-3, "maxknot" = 30, "TTmin" = 10, "TTmax" = 40,
"ego_cov_par" = list("sigma" = 3, "l" = 1, "tau" = 1e-3),
"ego_cov_fun" = "exp",
"ego_cov_fun_dtheta" = list("sigma" = dexp_dsigma,
"l" = dexp_dl),
"cov_diff" = TRUE,
"chooseK" = TRUE)
## potentially change default values in opt_master to user supplied values
if(length(opt) > 0)
{
for(i in 1:length(opt))
{
## if an argument is misspelled or not recognized, skip it
if(!any(names(opt_master) == names(opt)[i]))
{
# print(paste( "Warning: you supplied an argument to opt() not utilized by this function: ",
# names(opt)[i], sep = ""))
next
}
else{
ind <- which(names(opt_master) == names(opt)[i])
opt_master[[ind]] <- opt[[i]]
}
}
}
chooseK <- opt_master$chooseK
cov_diff <- opt_master$cov_diff
optim_method = opt_master$optim_method
optim_par = list("decay" = opt_master$decay,
"epsilon" = opt_master$epsilon,
"eta" = opt_master$eta,
"learn_rate" = opt_master$learn_rate)
maxit = opt_master$maxit
obj_tol = opt_master$obj_tol
grad_tol = opt_master$grad_tol
delta <- opt_master$delta
tol_knot <- opt_master$tol_knot
maxknot <- opt_master$maxknot
# TTmin <- opt_master$TTmin
# TTmax <- opt_master$TTmax
# ego_cov_par <- opt_master$ego_cov_par
# ego_cov_fun <- opt_master$ego_cov_fun
# ego_dcov_fun_dtheta <- opt_master$ego_dcov_fun_dtheta
## possibly specify default GP mean values
if(!is.numeric(mu))
{
mu <- rep(mean(y), times = length(y))
}
if(!is.numeric(muu_start))
{
muu_start <- rep(mean(y), times = nrow(xu_start))
}
## store optimized objective function values
obj_fun_vals <- numeric()
## store covariance parameters at every knot optimization iteration
cov_par_vals <- as.numeric(cov_par_start)
## keep track of number of GA steps at each iteration
ga_steps <- numeric()
xu <- xu_start
muu <- muu_start
## optimize the objective fn wrt the covariance parameters only
opt <- norm_grad_ascent_vi(cov_par_start = cov_par_start,
cov_fun = cov_fun,
dcov_fun_dtheta = dcov_fun_dtheta,
dcov_fun_dknot = NA, xu = xu,
xy = xy, ff = NA, y = y, knot_opt = NA,
mu = mu, muu = muu,
# transform = transform,
obj_fun = obj_fun,
opt = opt_master,
verbose = verbose, ...
)
current_obj_fun <- opt$obj_fun[length(opt$obj_fun)] ## current objective function value
obj_fun_vals[1] <- opt$obj_fun[length(opt$obj_fun)] ## store objective function value histories
cov_par_vals <- rbind(cov_par_vals, as.numeric(opt$cov_par)) ## store covariance parameter value histories
cov_par <- opt$cov_par ## current covariance parameter list
ga_steps[1] <- opt$iter
## store posterior mean and variance of u|y,theta,xu
upost <- list()
upost[[1]] <- list("mean" = opt$u_mean, "var" = opt$u_var)
numknots <- nrow(xu_start) ## keep track of the number of knots
iter <- 1
## do the OAT knot optimization
while(ifelse(test = chooseK, yes = numknots < maxknot && ifelse(test = length(obj_fun_vals) == 1, yes = TRUE,
no = ifelse(test = current_obj_fun - obj_fun_vals[length(obj_fun_vals) - 1] > tol_knot,
yes = TRUE, no = FALSE)),
no = numknots < maxknot)
)
{
iter <- iter + 1
if(verbose == TRUE)
{
print(iter)
}
## propose a new knot
# knot_prop <- proposal_fun(opt, ...)
knot_prop <- proposal_fun(norm_opt = opt, obj_fun = obj_fun,
y = y, cov_fun = cov_fun, opt = opt_master, ...)
if(verbose == TRUE)
{
print(knot_prop)
}
## optimize the covariance parameters and knot location
opt <- norm_grad_ascent_vi(cov_par_start = cov_par,
cov_fun = cov_fun,
dcov_fun_dtheta = dcov_fun_dtheta,
dcov_fun_dknot = if(cov_diff == TRUE) dcov_fun_dknot else NA,
xu = rbind(xu, knot_prop),
knot_opt = if(cov_diff == TRUE) (nrow(xu) + 1):(nrow(xu) + nrow(knot_prop)) else NA,
xy = xy, ff = NA, y = y,
mu = mu, muu = c(muu, muu[1]),
# transform = transform,
obj_fun = obj_fun,
opt = opt_master,
verbose = verbose, ...
)
ga_steps[iter] <- opt$iter
current_obj_fun <- opt$obj_fun[length(opt$obj_fun)]
obj_fun_vals[iter] <- opt$obj_fun[length(opt$obj_fun)]
if(ifelse(test = chooseK,
yes = current_obj_fun - obj_fun_vals[length(obj_fun_vals) - 1] > tol_knot,
no = TRUE)
)
{
cov_par_vals <- rbind(cov_par_vals, as.numeric(opt$cov_par))
## store posterior mean and variance of u|y,theta,xu
upost[[iter]] <- list("mean" = opt$u_mean, "var" = opt$u_var)
xu <- opt$xu
muu <- c(muu, muu[1])
cov_par <- opt$cov_par
numknots <- nrow(xu) ## keep track of the number of knots
}
}
if(numknots == maxknot)
{
return(list("cov_par" = cov_par,
"cov_fun" = cov_fun,
"xu" = xu,
"obj_fun" = obj_fun_vals,
"u_mean" = upost[[iter]]$mean,
"muu" = muu,
"mu" = mu,
"u_var" = upost[[iter]]$var,
"cov_par_history" = cov_par_vals,
"ga_steps" = ga_steps,
"upost" = upost))
}
else{
return(list("cov_par" = cov_par,
"cov_fun" = cov_fun,
"xu" = xu,
"obj_fun" = obj_fun_vals,
"u_mean" = upost[[iter - 1]]$mean,
"muu" = muu,
"mu" = mu,
"u_var" = upost[[iter - 1]]$var,
"cov_par_history" = cov_par_vals,
"ga_steps" = ga_steps,
"u_post" = upost))
}
}
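## A hedged sketch of one-at-a-time (OAT) knot selection using the EGO proposal
## function defined below. Knots are proposed by expected improvement and only the
## covariance parameters are re-optimized between proposals ("cov_diff" = FALSE), so no
## knot-derivative function is needed here; "dexp_dtau" is again a hypothetical name for
## the nugget derivative. Wrapped in if (FALSE) so it does not run when this file is
## sourced.
if (FALSE)
{
  set.seed(4)
  xy <- matrix(sort(runif(150)), ncol = 1)
  y <- sin(2 * pi * xy[, 1]) + rnorm(150, sd = 0.25)
  xu_start <- matrix(quantile(xy[, 1], probs = c(0.25, 0.5, 0.75)), ncol = 1)
  oat_fit <- oat_knot_selection_norm_vi(cov_par_start = list("sigma" = 1, "l" = 0.2, "tau" = 0.5),
                                        cov_fun = "exp",
                                        dcov_fun_dtheta = list("sigma" = dexp_dsigma,
                                                               "l" = dexp_dl,
                                                               "tau" = dexp_dtau), ## hypothetical name
                                        dcov_fun_dknot = NA,
                                        xu_start = xu_start,
                                        proposal_fun = knot_prop_ego_norm_vi,
                                        xy = xy, y = y,
                                        mu = rep(mean(y), length(y)),
                                        muu_start = rep(mean(y), nrow(xu_start)),
                                        opt = list("cov_diff" = FALSE, "maxknot" = 10),
                                        verbose = FALSE)
  oat_fit$xu       ## selected knot locations
  oat_fit$obj_fun  ## ELBO after each added knot
}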
## variational inference knot proposal functions
## knot prop EGO
#' @export
knot_prop_ego_norm_vi <- function(norm_opt,
# obj_fun,
# grad_loglik_fn,
# d2log_py_dff,
# maxit_nr = 2000,
# tol_nr = 1e-5,
# y,
opt = list(), ...)
{
## norm_opt is a list of output returned from VI gradient ascent function
## obj_fun is the objective function
## ... should contain the covariance function to be used to make proposals
## call this function ego_cov_fun(x1, x2, cov_par)
## ... Must also contain the argument TTmin denoting the minimum number of allowed objective function evaluations
## ... Must also contain the argument TTmax denoting the maximum number of allowed objective function evaluations
## ... Must also contain the argument ego_cov_par: covariance parameters for the meta model GP
## ... Must also contain the argument ego_cov_fun: the covariance function for the meta model GP
## ... Must also contain the argument ego_dcov_fun_dtheta: a list of functions which corresponds
## (in order) to the covariance parameters in
## cov_par. These functions compute the derivatives of the
## covariance function wrt the particular covariance parameter.
## ... Must also contain the argument predict_laplace: to get predictive variances for the first prediction
## extract names of optional arguments from opt
opt_master = list("optim_method" = "adadelta",
"decay" = 0.95, "epsilon" = 1e-6, "learn_rate" = 1e-2, "eta" = 0.8,
"maxit" = 1000, "obj_tol" = 1e-3, "grad_tol" = Inf, "maxit_nr" = 1000, "delta" = 1e-6,
"tol_knot" = 1e-3, "maxknot" = 30, "tol_nr" = 1e-5, "TTmin" = 10, "TTmax" = 40,
"ego_cov_par" = list("sigma" = 3, "l" = 1, "tau" = 0), "ego_cov_fun" = "exp",
"ego_cov_fun_dtheta" = list("sigma" = dexp_dsigma,
"l" = dexp_dl))
## potentially change default values in opt_master to user supplied values
if(length(opt) > 0)
{
for(i in 1:length(opt))
{
## if an argument is misspelled or not recognized, skip it
if(!any(names(opt_master) == names(opt)[i]))
{
# print("Warning: invalid or unnecessary argument to opt(). Proceeding with fingers crossed.")
next
}
else{
ind <- which(names(opt_master) == names(opt)[i])
opt_master[[ind]] <- opt[[i]]
}
}
}
optim_method = opt_master$optim_method
optim_par = list("decay" = opt_master$decay,
"epsilon" = opt_master$epsilon,
"eta" = opt_master$eta,
"learn_rate" = opt_master$learn_rate)
maxit = opt_master$maxit
obj_tol = opt_master$obj_tol
grad_tol = opt_master$grad_tol
delta <- opt_master$delta
tol_knot <- opt_master$tol_knot
maxknot <- opt_master$maxknot
TTmin <- opt_master$TTmin
TTmax <- opt_master$TTmax
ego_cov_par <- opt_master$ego_cov_par
ego_cov_fun <- opt_master$ego_cov_fun
ego_dcov_fun_dtheta <- opt_master$ego_cov_fun_dtheta
args <- list(...)
y <- args$y
obj_fun <- args$obj_fun
predict_laplace <- args$predict_laplace
cov_fun <- args$cov_fun
## store current knot locations
xu <- norm_opt$xu
cov_par <- norm_opt$cov_par
xy <- norm_opt$xy
## store objective function values
# obj_fun_vals <- rep(norm_opt$obj_fun[length(norm_opt$obj_fun)], times = nrow(xu))
obj_fun_vals <- numeric()
## first set of proposals for the new knot
# pred_vars <- as.numeric(diag(predict_laplace(u_mean = norm_opt$u_mean,
# u_var = norm_opt$u_var,
# xu = norm_opt$xu,
# x_pred = norm_opt$xy,
# cov_fun = norm_opt$cov_fun,
# cov_par = norm_opt$cov_par,
# mu = norm_opt$mu,
# muu = norm_opt$muu)$pred_var))
temp1 <- rbind(xu, xy)
if(any(duplicated(temp1)))
{
temp2 <- temp1[-which(duplicated(x = temp1)),]
xy_setminus_xu <- temp2[-c(1:nrow(xu)),]
}
else{
xy_setminus_xu <- xy
}
pseudo_prop <- matrix(xy_setminus_xu[sample.int(n = nrow(xy_setminus_xu), size = TTmin, replace = FALSE),],
ncol = ncol(xy_setminus_xu), nrow = TTmin)
## meta model x values
for(i in 1:nrow(pseudo_prop))
{
# pseudo_xu <- rbind(xu, pseudo_prop[i,])
pseudo_xu <- rbind(xu, pseudo_prop[i,])
## Create data GP covariance matrices
if(norm_opt$cov_fun == "ard")
{
lnames <- paste("l", 1:ncol(norm_opt$xy), sep = "")
Sigma12 <- make_cov_mat_ardC(x = norm_opt$xy, x_pred = pseudo_xu,
cov_par = norm_opt$cov_par, cov_fun = norm_opt$cov_fun,
delta = delta, lnames = lnames)
Sigma22 <- make_cov_mat_ardC(x = pseudo_xu, x_pred = matrix(),
cov_par = norm_opt$cov_par,
cov_fun = norm_opt$cov_fun,
delta = delta, lnames = lnames) - as.list(norm_opt$cov_par)$tau^2 * diag(nrow(pseudo_xu))
}
else{
Sigma12 <- make_cov_matC(x = norm_opt$xy, x_pred = pseudo_xu,
cov_par = norm_opt$cov_par, cov_fun = norm_opt$cov_fun, delta = delta)
Sigma22 <- make_cov_matC(x = pseudo_xu, x_pred = matrix(),
cov_par = norm_opt$cov_par,
cov_fun = norm_opt$cov_fun, delta = delta) - as.list(norm_opt$cov_par)$tau^2 * diag(nrow(pseudo_xu))
}
## create Z
# Z2 <- solve(a = Sigma22, b = t(Sigma12))
# Z3 <- Sigma12 * t(Z2)
# Z4 <- apply(X = Z3, MARGIN = 1, FUN = sum)
Z <- rep(norm_opt$cov_par$tau^2 + delta, times = nrow(norm_opt$xy))
## meta model y values
obj_fun_eval <- try(obj_fun(mu = norm_opt$mu, Z = Z,
Sigma12 = Sigma12,
Sigma22 = Sigma22, y = y,
trace_term_fun = trace_term_fun,
cov_par = cov_par, delta = delta))
if(inherits(obj_fun_eval, "try-error"))
{
while(inherits(obj_fun_eval, "try-error"))
{
pseudo_prop[i,] <- xy_setminus_xu[sample.int(n = nrow(xy_setminus_xu), size = 1, replace = FALSE),] +
rnorm(n = length(pseudo_prop[i,]), mean = 0, sd = 1e-6)
pseudo_xu <- rbind(xu, pseudo_prop[i,])
## Create data GP covariance matrices
if(norm_opt$cov_fun == "ard")
{
lnames <- paste("l", 1:ncol(norm_opt$xy), sep = "")
Sigma12 <- make_cov_mat_ardC(x = norm_opt$xy, x_pred = pseudo_xu,
cov_par = norm_opt$cov_par, cov_fun = norm_opt$cov_fun,
delta = delta, lnames = lnames)
Sigma22 <- make_cov_mat_ardC(x = pseudo_xu, x_pred = matrix(),
cov_par = norm_opt$cov_par,
cov_fun = norm_opt$cov_fun,
delta = delta, lnames = lnames) - as.list(norm_opt$cov_par)$tau^2 * diag(nrow(pseudo_xu))
}
else{
Sigma12 <- make_cov_matC(x = norm_opt$xy, x_pred = pseudo_xu,
cov_par = norm_opt$cov_par, cov_fun = norm_opt$cov_fun, delta = delta)
Sigma22 <- make_cov_matC(x = pseudo_xu, x_pred = matrix(),
cov_par = norm_opt$cov_par,
cov_fun = norm_opt$cov_fun, delta = delta) - as.list(norm_opt$cov_par)$tau^2 * diag(nrow(pseudo_xu))
}
## create Z
# Z2 <- solve(a = Sigma22, b = t(Sigma12))
# Z3 <- Sigma12 * t(Z2)
# Z4 <- apply(X = Z3, MARGIN = 1, FUN = sum)
Z <- rep(norm_opt$cov_par$tau^2 + delta, times = nrow(xy))
## meta model y values
obj_fun_eval <- try(obj_fun(mu = norm_opt$mu, Z = Z,
Sigma12 = Sigma12,
Sigma22 = Sigma22,
y = y,
trace_term_fun = trace_term_fun,
cov_par = cov_par,
delta = delta))
}
}
obj_fun_vals <- c(obj_fun_vals, obj_fun_eval)
# obj_fun_vals <- c(obj_fun_eval)
}
# obj_fun_x <- rbind(xu, pseudo_prop)
obj_fun_x <- rbind(pseudo_prop)
## get meta model parameters
## meta model GP negative log likelihood function
meta_mod_obj_fun <- function(cov_par, ...)
{
# y, x, cov_fun, mu
args <- list(...)
x <- args$x
obj_fun_vals <- args$obj_fun_vals
cov_fun <- args$cov_fun
mu <- args$mu
cov_par_names <- args$cov_par_names
tau <- args$tau
delta <- args$delta
cov_par <- as.list(exp(cov_par))
names(cov_par) <- cov_par_names
cov_par$tau <- tau
# print(cov_par)
if(cov_fun == "ard")
{
lnames <- paste("l", 1:ncol(x), sep = "")
Sig <- make_cov_mat_ardC(x = x, x_pred = matrix(),
cov_fun = cov_fun, cov_par = cov_par,
delta = delta, lnames = lnames)
}
else{
Sig <- make_cov_matC(x = x, x_pred = matrix(), cov_fun = cov_fun, cov_par = cov_par, delta = delta)
}
# print(Sig)
L <- t(chol(Sig))
mu <- rep(mu,times = length(obj_fun_vals))
temp <- -1 * (-1 * sum(log(diag(L))) - 1/2 * t(solve(a = L, b = obj_fun_vals - mu)) %*% solve(a = L, b = obj_fun_vals - mu))
return(temp)
}
## function to create d(Sigma)/dtheta
dsig_dtheta <- function(cov_par, par_name, dcov_fun_dtheta, x, transform = TRUE)
{
mat <- matrix(nrow = nrow(x), ncol = nrow(x))
for(i in 1:nrow(mat))
{
for(j in 1:ncol(mat))
{
temp <- eval(
substitute(expr = dcov_fun_dtheta$par(x1 = a, x2 = b, cov_par = c, transform = d),
env = list("a" = x[i,], "b" = x[j,], "c" = cov_par, "par" = par_name, "d" = transform))
)
mat[i,j] <- temp$derivative
}
}
return(mat)
}
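## dsig_dtheta() assumes dcov_fun_dtheta is a named list (names matching the covariance
## parameter names) of functions of (x1, x2, cov_par, transform) that return a list with a
## $derivative component. A sketch of one such entry (illustrative only; the cov_par fields
## sigma and l are assumptions, and the actual derivative functions are supplied by the caller
## via ego_dcov_fun_dtheta):
# dcov_dsigma <- function(x1, x2, cov_par, transform = TRUE)
# {
#   d2 <- sum((x1 - x2)^2)
#   k <- cov_par$sigma^2 * exp(-d2 / (2 * cov_par$l^2))
#   ## derivative wrt log(sigma) when transform = TRUE, otherwise wrt sigma
#   deriv <- if(transform){ 2 * k } else{ 2 * k / cov_par$sigma }
#   return(list("derivative" = deriv))
# }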
## meta model gradient function
meta_mod_grad <- function(cov_par, ...)
{
# y, x, cov_fun, mu, dcov_fun_dtheta, transform = TRUE
args <- list(...)
x <- args$x
obj_fun_vals <- args$obj_fun_vals
cov_fun <- args$cov_fun
dcov_fun_dtheta <- args$dcov_fun_dtheta
transform <- args$transform
cov_par <- as.list(exp(cov_par))
cov_par_names <- args$cov_par_names
names(cov_par) <- cov_par_names
mu <- args$mu
mu <- rep(mu, times = nrow(x))
tau <- args$tau
cov_par$tau <- tau
delta <- args$delta
## store gradient values
grad <- numeric(length(dcov_fun_dtheta))
## create current covariance matrix
if(cov_fun == "ard")
{
lnames <- paste("l", 1:ncol(x), sep = "")
Sig <- make_cov_mat_ardC(x = x, x_pred = matrix(), cov_fun = cov_fun,
cov_par = cov_par, delta = delta, lnames = lnames)
}
else{
Sig <- make_cov_matC(x = x, x_pred = matrix(), cov_fun = cov_fun, cov_par = cov_par, delta = delta)
}
L <- t(chol(Sig))
## loop through each covariance parameter
for(p in 1:length(dcov_fun_dtheta))
{
par_name <- names(cov_par)[p]
## next compute dSigma/dtheta_j
dSigma_dtheta <- dsig_dtheta(par_name = par_name,
cov_par = cov_par,
dcov_fun_dtheta = dcov_fun_dtheta,
x = x,
transform = transform)
grad[p] <- -(
-1/2 * sum(diag( solve(a = t(L), b = solve(a = L, b = dSigma_dtheta)) )) +
1/2 * t(solve(a = L, b = obj_fun_vals - mu)) %*% solve(a = L, b = dSigma_dtheta) %*% solve(a = t(L), b = solve(a = L, b = obj_fun_vals - mu))
)
}
return(grad)
}
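## Each grad[p] above is the derivative of the negative log likelihood computed in
## meta_mod_obj_fun with respect to covariance parameter theta_p (the log-scale chain rule is
## handled inside the supplied dcov_fun_dtheta functions when transform = TRUE):
##   d nll / d theta_p = (1/2) * tr( solve(Sig) %*% dSig_dtheta_p )
##                       - (1/2) * t(v - mu) %*% solve(Sig) %*% dSig_dtheta_p %*% solve(Sig) %*% (v - mu),
## with all solves carried out through the Cholesky factor L.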
## optimize meta model covariance parameters
temp_opt <- try(optim(par = log(as.numeric(ego_cov_par))[1:2], fn = meta_mod_obj_fun, gr = meta_mod_grad, method = "BFGS",
obj_fun_vals = obj_fun_vals, x = obj_fun_x, cov_fun = ego_cov_fun,
mu = obj_fun_vals[1], dcov_fun_dtheta = ego_dcov_fun_dtheta,
transform = TRUE, cov_par_names = names(ego_cov_par)[1:2], tau = ego_cov_par$tau, delta = delta))
## If optim hits values of sigma that cause an infinite variance or a zero length scale,
## fall back to semi-reasonable parameter values
if(inherits(temp_opt, "try-error"))
{
warning("Optimizing the meta-model GP was unsuccessful, likely due to extreme parameter values; setting covariance parameters manually.")
temp_opt <- list()
temp_opt$par <- c(log(max(obj_fun_vals) - min(obj_fun_vals)), log( (max(obj_fun_x) - min(obj_fun_x))/1000 ))
}
cov_par_names <- names(ego_cov_par)
ego_cov_par <- as.list(c(exp(temp_opt$par), ego_cov_par$tau))
names(ego_cov_par) <- cov_par_names
K12 <- make_cov_matC(x = xy_setminus_xu, x_pred = obj_fun_x, cov_par = ego_cov_par, cov_fun = ego_cov_fun, delta = delta / 1000)
K22 <- make_cov_matC(x = obj_fun_x, x_pred = matrix(), cov_fun = ego_cov_fun, cov_par = ego_cov_par, delta = delta / 1000)
pred_obj_fun <- rep(obj_fun_vals[1], times = nrow(K12)) + as.numeric(K12 %*% solve(a = K22, b = obj_fun_vals - rep(obj_fun_vals[1], times = ncol(K12)))) ## objective function predictions
obj_fun_vars <- numeric() ## objective function marginal variances
std_vals <- numeric() ## standardized predictions
EI <- numeric() ## expected improvement
for(i in 1:nrow(K12))
{
obj_fun_vars[i] <- ego_cov_par$sigma^2 + ego_cov_par$tau^2 - K12[i,] %*% solve(a = K22, b = t(t(K12[i,])))
if(obj_fun_vars[i] > ego_cov_par$tau^2)
{
std_vals[i] <- (pred_obj_fun[i] - max(obj_fun_vals)) / sqrt(obj_fun_vars[i])
EI[i] <- sqrt(obj_fun_vars[i]) * ( std_vals[i] * pnorm(q = std_vals[i], mean = 0, sd = 1) + dnorm(x = std_vals[i], mean = 0, sd = 1) )
}
else{EI[i] <- 0}
}
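## pred_obj_fun and obj_fun_vars above are the standard GP predictive mean and marginal variance
## of the objective at the remaining data locations, and EI is the expected improvement for
## maximization:
##   z  = (pred - best_observed) / sd,
##   EI = sd * ( z * pnorm(z) + dnorm(z) ),
## with EI set to 0 whenever the predictive variance does not exceed the meta-model nugget tau^2.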
iter <- TTmin
while(iter < TTmax)
{
pseudo_prop <- matrix(xy_setminus_xu[which.max(EI),], nrow = 1, ncol = ncol(xy_setminus_xu))
# print(pseudo_prop)
# print(obj_fun_x)
# if(
# !is.na(prodlim::row.match(x = as.data.frame(pseudo_prop),
# table = as.data.frame(obj_fun_x)))
# )
# {
# break
# }
## meta model x values
pseudo_xu <- rbind(xu, pseudo_prop)
## Create data GP covariance matrices
if(norm_opt$cov_fun == "ard")
{
lnames <- paste("l", 1:ncol(norm_opt$xy), sep = "")
Sigma12 <- make_cov_mat_ardC(x = norm_opt$xy, x_pred = pseudo_xu,
cov_par = norm_opt$cov_par, cov_fun = norm_opt$cov_fun,
delta = delta, lnames = lnames)
Sigma22 <- make_cov_mat_ardC(x = pseudo_xu, x_pred = matrix(),
cov_par = norm_opt$cov_par,
cov_fun = norm_opt$cov_fun,
delta = delta,
lnames = lnames) - as.list(norm_opt$cov_par)$tau^2 * diag(nrow(pseudo_xu))
}
else{
Sigma12 <- make_cov_matC(x = norm_opt$xy, x_pred = pseudo_xu,
cov_par = norm_opt$cov_par, cov_fun = norm_opt$cov_fun, delta = delta)
Sigma22 <- make_cov_matC(x = pseudo_xu, x_pred = matrix(),
cov_par = norm_opt$cov_par,
cov_fun = norm_opt$cov_fun, delta = delta) - as.list(norm_opt$cov_par)$tau^2 * diag(nrow(pseudo_xu))
}
## create Z
# Z2 <- solve(a = Sigma22, b = t(Sigma12))
# Z3 <- Sigma12 * t(Z2)
# Z4 <- apply(X = Z3, MARGIN = 1, FUN = sum)
Z <- rep(norm_opt$cov_par$tau^2 + delta, times = nrow(norm_opt$xy))
## meta model y values
obj_fun_eval <- try(obj_fun(mu = norm_opt$mu, Z = Z,
Sigma12 = Sigma12,
Sigma22 = Sigma22,
y = y,
trace_term_fun = trace_term_fun,
cov_par = cov_par,
delta = delta))
if(inherits(obj_fun_eval, "try-error"))
{
while(inherits(obj_fun_eval, "try-error"))
{
pseudo_prop <- xy_setminus_xu[sample.int(n = nrow(xy_setminus_xu), size = 1, replace = FALSE),] +
rnorm(n = length(pseudo_prop), mean = 0, sd = 1e-6)
pseudo_xu <- rbind(xu, pseudo_prop)
## Create data GP covariance matrices
if(norm_opt$cov_fun == "ard")
{
lnames <- paste("l", 1:ncol(norm_opt$xy), sep = "")
Sigma12 <- make_cov_mat_ardC(x = norm_opt$xy, x_pred = pseudo_xu,
cov_par = norm_opt$cov_par,
cov_fun = norm_opt$cov_fun,
delta = delta, lnames = lnames)
Sigma22 <- make_cov_mat_ardC(x = pseudo_xu, x_pred = matrix(),
cov_par = norm_opt$cov_par,
cov_fun = norm_opt$cov_fun,
delta = delta,
lnames = lnames) - as.list(norm_opt$cov_par)$tau^2 * diag(nrow(pseudo_xu))
}
else{
Sigma12 <- make_cov_matC(x = norm_opt$xy, x_pred = pseudo_xu,
cov_par = norm_opt$cov_par, cov_fun = norm_opt$cov_fun, delta = delta)
Sigma22 <- make_cov_matC(x = pseudo_xu, x_pred = matrix(),
cov_par = norm_opt$cov_par,
cov_fun = norm_opt$cov_fun, delta = delta) - as.list(norm_opt$cov_par)$tau^2 * diag(nrow(pseudo_xu))
}
## create Z
# Z2 <- solve(a = Sigma22, b = t(Sigma12))
# Z3 <- Sigma12 * t(Z2)
# Z4 <- apply(X = Z3, MARGIN = 1, FUN = sum)
Z <- rep(norm_opt$cov_par$tau^2 + delta, times = nrow(norm_opt$xy))
## meta model y values
obj_fun_eval <- try(obj_fun(mu = norm_opt$mu,
Z = Z,
Sigma12 = Sigma12,
Sigma22 = Sigma22,
y = y,
trace_term_fun = trace_term_fun,
cov_par = cov_par,
delta = delta))
}
}
obj_fun_vals <- c(obj_fun_vals, obj_fun_eval)
# obj_fun_vals <- c(obj_fun_vals, norm_opt$objective_function_values[length(norm_opt$objective_function_values)])
obj_fun_x <- rbind(obj_fun_x, pseudo_prop)
## optimize meta model covariance parameters
temp_opt <- try(optim(par = log(as.numeric(ego_cov_par))[1:2], fn = meta_mod_obj_fun, gr = meta_mod_grad, method = "BFGS",
obj_fun_vals = obj_fun_vals, x = obj_fun_x, cov_fun = ego_cov_fun,
mu = obj_fun_vals[1], dcov_fun_dtheta = ego_dcov_fun_dtheta,
transform = TRUE, cov_par_names = names(ego_cov_par)[1:2], tau = ego_cov_par$tau, delta = delta))
## If optim hits values of sigma that cause an infinite variance or a zero length scale,
## fall back to semi-reasonable parameter values
if(inherits(temp_opt, "try-error"))
{
warning("Optimizing the meta-model GP was unsuccessful, likely due to extreme parameter values; setting covariance parameters manually.")
temp_opt <- list()
temp_opt$par <- c(log(max(obj_fun_vals) - min(obj_fun_vals)), log( (max(obj_fun_x) - min(obj_fun_x))/1000 ))
}
cov_par_names <- names(ego_cov_par)
ego_cov_par <- as.list(c(exp(temp_opt$par), ego_cov_par$tau))
names(ego_cov_par) <- cov_par_names
K12 <- make_cov_matC(x = xy_setminus_xu, x_pred = obj_fun_x, cov_par = ego_cov_par, cov_fun = ego_cov_fun, delta = delta / 1000)
K22 <- make_cov_matC(x = obj_fun_x, x_pred = matrix(), cov_fun = ego_cov_fun, cov_par = ego_cov_par, delta = delta / 1000)
pred_obj_fun <- rep(obj_fun_vals[1], times = nrow(K12)) + as.numeric(K12 %*% solve(a = K22, b = obj_fun_vals - rep(obj_fun_vals[1], times = ncol(K12)))) ## objective function predictions
obj_fun_vars <- numeric() ## objective function marginal variances
std_vals <- numeric() ## standardized predictions
EI <- numeric() ## expected improvement
for(i in 1:nrow(K12))
{
obj_fun_vars[i] <- ego_cov_par$sigma^2 + ego_cov_par$tau^2 - K12[i,] %*% solve(a = K22, b = t(t(K12[i,])))
if(obj_fun_vars[i] > ego_cov_par$tau^2)
{
std_vals[i] <- (pred_obj_fun[i] - max(obj_fun_vals)) / sqrt(obj_fun_vars[i])
EI[i] <- sqrt(obj_fun_vars[i]) * ( std_vals[i] * pnorm(q = std_vals[i], mean = 0, sd = 1) + dnorm(x = std_vals[i], mean = 0, sd = 1) )
}
else{EI[i] <- 0}
}
# std_vals <- (pred_obj_fun - max(obj_fun_vals)) / sqrt(obj_fun_vars)
# EI <- sqrt(obj_fun_vars) * ( std_vals * pnorm(q = std_vals, mean = 0, sd = 1) + dnorm(x = std_vals, mean = 0, sd = 1) )
iter <- iter + 1
}
# return(matrix(nrow = 1, ncol = ncol(xu),
# data = matrix(obj_fun_x[(nrow(xu) + 1):nrow(obj_fun_x),], ncol = ncol(obj_fun_x))[which.max(obj_fun_vals[(nrow(xu) + 1):nrow(obj_fun_x)]), ]))
return(matrix(nrow = 1, ncol = ncol(xu),
data = obj_fun_x[which.max(obj_fun_vals),]))
}
## Knot proposal based on a random subset of data locations
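#' Propose a new knot from a random subset of data locations.
#' @description Evaluate the variational objective at TTmax randomly chosen candidate data
#'   locations (re-sampling, with a small jitter, any candidate whose evaluation fails) and
#'   return the location, among the current knots and the candidates, with the largest
#'   objective value.
#'
#' @param norm_opt A list of output from the Gaussian VI optimization, containing at least
#'   \code{xu}, \code{xy}, \code{cov_par}, \code{cov_fun}, \code{mu}, and \code{obj_fun}.
#' @param opt A named list of optional tuning parameters; see \code{opt_master} inside the
#'   function body for the recognized names and their defaults.
#' @param ... Additional arguments, including \code{y} (the observed data) and \code{obj_fun}
#'   (the objective function, e.g. \code{elbo_fun}); see the comments in the function body.
#' @return A one-row matrix containing the proposed knot location.
#' @examples
#' \dontrun{
#' ## A sketch assuming `fit` is the output list from the Gaussian VI optimizer:
#' new_knot <- knot_prop_random_norm_vi(norm_opt = fit,
#'                                      opt = list("TTmax" = 20),
#'                                      y = y, obj_fun = elbo_fun)
#' xu_new <- rbind(fit$xu, new_knot)
#' }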
#' @export
knot_prop_random_norm_vi <- function(norm_opt,
# obj_fun,
# grad_loglik_fn,
# d2log_py_dff,
# maxit_nr = 2000,
# tol_nr = 1e-5,
# y,
opt = list(), ...)
{
## norm_opt is a list of output returned from the Gaussian (normal) VI optimization
## obj_fun is the objective function
## ... should contain the covariance function to be used to make proposals
## call this function ego_cov_fun(x1, x2, cov_par)
## ... Must also contain the argument TTmin denoting the minimum number of allowed objective function evaluations
## ... Must also contain the argument TTmax denoting the maximum number of allowed objective function evaluations
## ... Must also contain the argument ego_cov_par: covariance parameters for the meta model GP
## ... Must also contain the argument ego_cov_fun: the covariance function for the meta model GP
## ... Must also contain the argument ego_dcov_fun_dtheta: a list of functions which corresponds
## (in order) to the covariance parameters in
## cov_par. These functions compute the derivatives of the
## covariance function wrt the particular covariance parameter.
## ... Must also contain the argument predict_laplace: to get predictive variances for the first prediction
## extract names of optional arguments from opt
opt_master = list("optim_method" = "adadelta",
"decay" = 0.95, "epsilon" = 1e-6, "learn_rate" = 1e-2, "eta" = 0.8,
"maxit" = 1000, "obj_tol" = 1e-3, "grad_tol" = 1e-1, "maxit_nr" = 1000, "delta" = 1e-6,
"tol_knot" = 1e-1, "maxknot" = 30, "TTmax" = 40)
## potentially change default values in opt_master to user supplied values
if(length(opt) > 0)
{
for(i in 1:length(opt))
{
## if an argument is misspelled or not accepted, throw an error
if(!any(names(opt_master) == names(opt)[i]))
{
# print("Warning: invalid or unnecessary argument to opt(). Proceeding with fingers crossed.")
next
# return()
}
else{
ind <- which(names(opt_master) == names(opt)[i])
opt_master[[ind]] <- opt[[i]]
}
}
}
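## Only names already present in opt_master are honored; for example,
## opt = list("TTmax" = 20, "delta" = 1e-5) overrides those two defaults, while unrecognized
## names are silently skipped.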
optim_method = opt_master$optim_method
optim_par = list("decay" = opt_master$decay,
"epsilon" = opt_master$epsilon,
"eta" = opt_master$eta,
"learn_rate" = opt_master$learn_rate)
maxit = opt_master$maxit
obj_tol = opt_master$obj_tol
grad_tol = opt_master$grad_tol
delta <- opt_master$delta
tol_knot <- opt_master$tol_knot
maxknot <- opt_master$maxknot
TTmin <- opt_master$TTmin
TTmax <- opt_master$TTmax
ego_cov_par <- opt_master$ego_cov_par
ego_cov_fun <- opt_master$ego_cov_fun
ego_dcov_fun_dtheta <- opt_master$ego_dcov_fun_dtheta
args <- list(...)
y <- args$y
obj_fun <- args$obj_fun
predict_laplace <- args$predict_laplace
cov_fun <- args$cov_fun
## store current knot locations
xu <- norm_opt$xu
cov_par <- norm_opt$cov_par
xy <- norm_opt$xy
## store objective function values
obj_fun_vals <- rep(norm_opt$obj_fun[length(norm_opt$obj_fun)], times = nrow(xu))
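## Assign each current knot row the final objective value from the previous optimization so that
## the existing knot configuration competes with the new candidate locations when the maximizer
## is selected at the end of the function.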
## first set of proposals for the new knot
# pred_vars <- as.numeric(diag(predict_laplace(u_mean = norm_opt$u_mean,
# u_var = norm_opt$u_var,
# xu = norm_opt$xu,
# x_pred = norm_opt$xy,
# cov_fun = norm_opt$cov_fun,
# cov_par = norm_opt$cov_par,
# mu = norm_opt$mu,
# muu = norm_opt$muu)$pred_var))
temp1 <- rbind(xu, xy)
if(any(duplicated(temp1)))
{
temp2 <- temp1[-which(duplicated(x = temp1)),]
xy_setminus_xu <- temp2[-c(1:nrow(xu)),]
}
else{
xy_setminus_xu <- xy
}
pseudo_prop <- matrix(xy_setminus_xu[sample.int(n = nrow(xy_setminus_xu), size = TTmax, replace = FALSE),],
ncol = ncol(xy_setminus_xu), nrow = TTmax)
## meta model x values
for(i in 1:nrow(pseudo_prop))
{
pseudo_xu <- rbind(xu, pseudo_prop[i,])
## Create data GP covariance matrices
if(norm_opt$cov_fun == "ard")
{
lnames <- paste("l", 1:ncol(norm_opt$xy), sep = "")
Sigma12 <- make_cov_mat_ardC(x = norm_opt$xy, x_pred = pseudo_xu,
cov_par = norm_opt$cov_par,
cov_fun = norm_opt$cov_fun,
delta = delta,
lnames = lnames)
Sigma22 <- make_cov_mat_ardC(x = pseudo_xu, x_pred = matrix(),
cov_par = norm_opt$cov_par,
cov_fun = norm_opt$cov_fun,
delta = delta,
lnames = lnames) - as.list(norm_opt$cov_par)$tau^2 * diag(nrow(pseudo_xu))
}
else{
Sigma12 <- make_cov_matC(x = norm_opt$xy, x_pred = pseudo_xu,
cov_par = norm_opt$cov_par, cov_fun = norm_opt$cov_fun, delta = delta)
Sigma22 <- make_cov_matC(x = pseudo_xu, x_pred = matrix(),
cov_par = norm_opt$cov_par,
cov_fun = norm_opt$cov_fun, delta = delta) - as.list(norm_opt$cov_par)$tau^2 * diag(nrow(pseudo_xu))
}
## create Z
Z <- rep(norm_opt$cov_par$tau^2 + delta, times = nrow(norm_opt$xy))
## meta model y values
obj_fun_eval <- try(obj_fun(mu = norm_opt$mu, Z = Z,
Sigma12 = Sigma12,
Sigma22 = Sigma22,
y = y, cov_par = cov_par,
trace_term_fun = trace_term_fun,
delta = delta))
if(inherits(obj_fun_eval, "try-error"))
{
while(inherits(obj_fun_eval, "try-error"))
{
pseudo_prop[i,] <- xy_setminus_xu[sample.int(n = nrow(xy_setminus_xu), size = 1, replace = FALSE),] +
rnorm(n = length(pseudo_prop[i,]), mean = 0, sd = 1e-6)
pseudo_xu <- rbind(xu, pseudo_prop[i,])
## Create data GP covariance matrices
if(norm_opt$cov_fun == "ard")
{
lnames <- paste("l", 1:ncol(norm_opt$xy), sep = "")
Sigma12 <- make_cov_mat_ardC(x = norm_opt$xy, x_pred = pseudo_xu,
cov_par = norm_opt$cov_par,
cov_fun = norm_opt$cov_fun,
delta = delta, lnames = lnames)
Sigma22 <- make_cov_mat_ardC(x = pseudo_xu, x_pred = matrix(),
cov_par = norm_opt$cov_par,
cov_fun = norm_opt$cov_fun,
delta = delta, lnames = lnames) - as.list(norm_opt$cov_par)$tau^2 * diag(nrow(pseudo_xu))
}
else{
Sigma12 <- make_cov_matC(x = norm_opt$xy, x_pred = pseudo_xu,
cov_par = norm_opt$cov_par, cov_fun = norm_opt$cov_fun, delta = delta)
Sigma22 <- make_cov_matC(x = pseudo_xu, x_pred = matrix(),
cov_par = norm_opt$cov_par,
cov_fun = norm_opt$cov_fun, delta = delta) - as.list(norm_opt$cov_par)$tau^2 * diag(nrow(pseudo_xu))
}
## create Z
# Z2 <- solve(a = Sigma22, b = t(Sigma12))
# Z3 <- Sigma12 * t(Z2)
# Z4 <- apply(X = Z3, MARGIN = 1, FUN = sum)
Z <- rep(norm_opt$cov_par$tau^2 + delta, times = nrow(norm_opt$xy))
## meta model y values
obj_fun_eval <- try(obj_fun(mu = norm_opt$mu,
Z = Z,
Sigma12 = Sigma12,
Sigma22 = Sigma22,
y = y, cov_par = cov_par,
trace_term_fun = trace_term_fun,
delta = delta))
}
}
obj_fun_vals <- c(obj_fun_vals, obj_fun_eval)
}
obj_fun_x <- rbind(xu, pseudo_prop)
# return(matrix(nrow = 1, ncol = ncol(xu),
# data = matrix(obj_fun_x[(nrow(xu) + 1):nrow(obj_fun_x),], ncol = ncol(obj_fun_x))[which.max(obj_fun_vals[(nrow(xu) + 1):nrow(obj_fun_x)]), ]))
return(matrix(nrow = 1, ncol = ncol(xu),
data = obj_fun_x[which.max(obj_fun_vals),]))
}