dolka: Utility functions for Bayesian optimization

Documented in qEI.grad

decompo <- function(x, mu, Sigma, k) {
    ## ### 
    if (length(k) == length(x[ , 1])) {
        return(dmnorm(c(x), c(mu), Sigma))
    }
    
    x1 <- x[k, 1]
    x2 <- x[-k, 1]
    mu1 <- mu[k, 1]
    Sig22 <- Sigma[-k, -k]
    Sig21 <- matrix(Sigma[-k, k], ncol = length(k))
    Sig12 <- t(Sig21)
    Sig11 <- Sigma[k, k]
    Sig11Inv <- solve(Sig11)
    varcov <- Sig22 - Sig21 %*% Sig11Inv %*% Sig12
    varcov <- 0.5 * (varcov + t(varcov))
    if (min(diag(varcov)) <= 0) {
        diag(varcov) <- diag(varcov) * (diag(varcov) >= 0) + 1e-9
    }
    moy <- mu[-k, 1] + Sig21 %*% Sig11Inv %*% (x1 - mu1)
    
    
    return (
        dmnorm(x = x1,  mu[k, 1], varcov = Sig11) *
        pmnorm(x = t(matrix(x2)), mean = t(moy), varcov = varcov,
               maxpts = (length(moy) * 200))
    )
}

## gradient of CDF
GPhi <- function(x, mu, Sigma) {
    q <- length(x[ , 1])
    res <- matrix(NaN, q, 1)
    for (i in 1:q) {
        res[i, 1] <- decompo(x, mu, Sigma, i)
    }
    return( res )
}

## hessian matrix of CDF
HPhi <- function(x, mu, Sigma, gradient = NULL) {
    q <- length(x[ , 1])
    if (q == 1) {
        res <- -(x - mu) / Sigma * dnorm(x, mu, sqrt(Sigma))
    }
    else {
        res <- matrix(0,q,q)
        for (i in 1:(q - 1)) { 
            for (j in (i + 1):q) {
                res[i, j] <- decompo(x, mu, Sigma, c(i, j))
            }
        }
        ## hessian matrix is symetric diagonal terms can be computed
        ## with the gradient of CDF and the other hessian terms
        res <- res + t(res)
        
        if(is.null(gradient)) {
            res <- res - diag(((c(x) - c(mu)) * c(GPhi(x, mu, Sigma)) +
                               diag(Sigma %*% res)) / diag(Sigma))
        } else {
            res <- res - diag(((c(x) - c(mu)) * c(gradient) +
                               diag(Sigma %*% res)) / diag(Sigma))
        }
    }
    return(res)
}

## *****************************************************************************
##' @description Compute an exact or approximate radient of the
##'     multipoint expected improvement criterion.
##' 
##' @title Gradient of the Multipoint Expected Improvement (qEI) Criterion
##' 
##' @param x a matrix representing the set of input points (one row corresponds
##' to one point) where to evaluate the gradient,
##'
##' @param model an object of class \code{\link[DiceKriging]{km}}.
##'
##' @param plugin optional scalar: if provided, it replaces the
##'     minimum of the current observations.
##'
##' @param type "SK" or "UK" (by default), depending whether
##'     uncertainty related to trend estimation has to be taken into
##'     account,
##'
##' @param minimization logical specifying if EI is used in
##'     minimiziation or in maximization.
##'
##' @param fastCompute if TRUE, a fast approximation method based on a
##'     semi-analytic formula is used (see [Marmin 2014] for details).
##'
##' @param eps the value of \emph{epsilon} of the fast computation
##'     trick.  Relevant only if \code{fastComputation} is TRUE.
##' 
##' @param envir an optional environment specifying where to get
##'     intermediate values calculated in \code{\link{qEI}}.
##'
##' @return The gradient of the multipoint expected improvement
##'     criterion with respect to \code{x}. A 0-matrix is returned if
##'     the batch of input points contains twice the same point or a
##'     point from the design experiment of the km object (the
##'     gradient does not exist in these cases).
##'
##' @section Caution: When \code{envir} is not \code{NULL} this
##'     environment is assumed to contain the kriging predictions with
##'     the derivatives evaluated. The user should make sure that the
##'     model used to compute the prediction is the model used here
##'     and that the prediction was made with the same value for the
##'     argument \code{type}.
##' 
##' @author Sebastien Marmin, Clement Chevalier and David Ginsbourger.
##'
##' @seealso \code{\link{qEI}}
##'
##' @references
##'
##' S. Marmin, C. Chevalier and  D. Ginsbourger (2019)
##' Differentiating the multipoint Expected Improvement for optimal batch design.
##' arXiv:1503.05509.
##' 
##' C. Chevalier and D. Ginsbourger (2014) Learning and Intelligent
##' Optimization - 7th International Conference, Lion 7, Catania, Italy,
##' January 7-11, 2013, Revised Selected Papers, chapter Fast computation of
##' the multipoint Expected Improvement with applications in batch selection,
##' pages 59-69, Springer.
##' 
##' D. Ginsbourger, R. Le Riche, L. Carraro (2007), A Multipoint Criterion for
##' Deterministic Parallel Global Optimization based on Kriging. The
##' International Conference on Non Convex Programming, 2007.
##' 
##' D. Ginsbourger, R. Le Riche, and L. Carraro. Kriging is well-suited to
##' parallelize optimization (2010), In Lim Meng Hiot, Yew Soon Ong, Yoel
##' Tenne, and Chi-Keong Goh, editors, \emph{Computational Intelligence in
##' Expensive Optimization Problems}, Adaptation Learning and Optimization,
##' pages 131-162. Springer Berlin Heidelberg.
##' 
##' S. Marmin. Developpements pour l'evaluation et la maximisation du critere
##' d'amelioration esperee multipoint en optimisation globale (2014). Master's
##' thesis, Mines Saint-Etienne (France) and University of Bern (Switzerland).
##' 
##' J. Mockus (1988), \emph{Bayesian Approach to Global Optimization}. Kluwer
##' academic publishers.
##' 
##' M. Schonlau (1997), \emph{Computer experiments and global optimization},
##' Ph.D. thesis, University of Waterloo.
##' 
##' @keywords models optimize
##'
##' @export 
##' 
##' @examples
##' set.seed(15)
##' # Example 1 - validation by comparison to finite difference approximations
##' 
##' # a 9-points factorial design, and the corresponding response
##' d <- 2
##' n <- 9
##' design <- expand.grid(x1 = seq(0, 1, length = 3),  x2 = seq(0, 1, length = 3)) 
##' y <- apply(design, 1, branin)
##' 
##' # learning
##' model <- km(~1, design = design, response = y)
##' 
##' # pick up 2 points sampled from the simple expected improvement
##' q <- 2  # increase to 4 for a more meaningful test
##' X <- sampleFromEI(model, n = q)
##' 
##' # compute the gradient at the 4-point batch
##' grad.analytic <- qEI.grad(X, model)
##' 
##' # numerically compute the gradient
##' grad.numeric <- matrix(NaN, q, d)
##' eps <- 1e-6
##' EPS <- matrix(0,q,d)
##' for (i in 1:q) {
##'     for (j in 1:d) {
##'         EPS[i, j] <- eps
##'         grad.numeric[i,j] <- (qEI(X + EPS, model, fastCompute = FALSE)-
##'                               qEI(X, model, fastCompute = FALSE)) / eps
##'         EPS[i, j] <- 0
##'      }  
##' }
##' print(grad.numeric)
##' print(grad.analytic)
##'
##' \dontrun{
##'      lower <- c(0, 0); upper <- c(1, 1)
##'      # graphics: displays the EI criterion, the design points in black, 
##'      # the batch points in red and the gradient in blue.
##'      nGrid <- 50
##'      gridAxe1 <- seq(lower[1], upper[1], length = nGrid)
##'      gridAxe2 <- seq(lower[2], upper[2], length = nGrid)
##'      grid <- expand.grid(gridAxe1, gridAxe2)
##'      aa <- apply(grid, 1, EI, model = model)
##'      myMat <- matrix(aa, nrow = nGrid)
##'      image(x = gridAxe1, y = gridAxe2, z = myMat, 
##'            col = colorRampPalette(c("darkgray", "white"))(5 * 10), 
##'            ylab = names(design)[1], xlab = names(design)[2], 
##'            main = sprintf("qEI - gradient of a batch of %d points", q),
##'            axes = TRUE, 
##'            zlim = c(min(myMat), max(myMat)))
##'      contour(x = gridAxe1, y = gridAxe2, z = myMat, 
##'             add = TRUE, nlevels = 10)
##'      points(X[ , 1], X[ , 2], pch = 19, col = 'red')
##'      points(model@X[ , 1], model@X[ , 2], pch = 19)
##'      arrows(X[ , 1], X[ , 2],
##'             X[ , 1] + 0.012 * grad.analytic[ , 1],
##'             X[ , 2] + 0.012 * grad.analytic[ , 2], col = 'blue')
##' }
qEI.grad <- function(x, model,
                     plugin = NULL,
                     type = c("UK", "SK"),
                     minimization = TRUE, fastCompute = TRUE,
                     eps = 1e-6,
                     envir = NULL) {
    
    type <- match.arg(type)
    
    ## compteur_global <<- compteur_global + 1
    
    if (!minimization) {
        if (is.null(plugin)) {
            plugin <- -max(model@y)
        } 
    }
    
    d <- model@d
    if (!is.matrix(x)) x <- matrix(x, ncol = d)
    
    q <- nrow(x)
    xb <- rbind(model@X, x)
    ux <- unique(round(xb, digits = 8))
    if(length(xb[ , 1]) != length(ux[ , 1])) {
        return (matrix(0, q, d))
    }
    
    if (is.null(plugin)) plugin <- min(model@y)

    if (q == 1) {
        if (!minimization) {
            stop("'qEI.grad' doesn't work in \'minimization = FALSE\' when ",
                 "dim = 1 (in progress).")
        }
        return(EI.grad(x, model, plugin, type, envir))
    }
    
    ## ==========================================================================
    ## XXXY the variable 'pk' does not seem to be used later!
    ## ==========================================================================

    if (!is.null(envir)) {
        
        if (fastCompute == TRUE) {
            toget <- matrix(c("pk"), 1, 1)
            apply(toget, 1, get, envir = envir)
            pk <- envir$pk
        } else {
            toget <- matrix(c("pk","symetric_term"), 2, 1)
            apply(toget, 1, get, envir = envir)
            pk <- envir$pk
            symetric_term <- envir$symetric_term
        }
    }

    ## ==========================================================================
    ## Get kriging (conditionnal) mean and covariance and their
    ## derivatives
    ## XXXY This code from DiceOptim is desactivated here.
    ## ==========================================================================
    
    if (FALSE) {
    
        kriging.dx <- krigingDeriv(x = x, model = model,
                                   type = type, envir = envir)
        
        if (is.null(kriging.dx)) {
            return (matrix(0, q, d))
        }
    }
    
    if (!is.null(envir)) {
        pred <- get("pred", envir = envir)
    } else {
        pred  <- predict(object = model, newdata = x, type = type,
                         se.compute = FALSE,
                         cov.compute = TRUE,
                         deriv = TRUE,
                         checkNames = FALSE)
        
    }
    
    
    ## ==========================================================================
    ## Kriging mean vector and covariance matrix
    ##
    ## o kriging.mean: vector with length q
    ##
    ## o kriging.cov:  2-dimensional array with dim c(q, q)
    ##
    ## ==========================================================================

    kriging.mean <- pred[["mean"]]
    kriging.cov <- pred[["cov"]]
    
    ## =========================================================================
    ## Jacobian arrays of the mean vector and the covariance matrix
    ##
    ## o kriging.mean.jacob: 3-dimensional array with dim c(q, d, q)
    ##
    ##          kriging.mean.jacob[l, j, k] = d(m_k) / d(x_lj)
    ##
    ## o kriging.cov.jacob: 4-dimensional array with dim  c(q, d, q, q)
    ## 
    ##          kriging.cov.jacob[l, j, k, i]= d(Sigma_ki) / (d x_lj)
    ##
    ## CAUTION Since the indexation of this code taken from DiceOptim
    ## differs from that in the predict method, 'aperm' is used.
    ## =========================================================================
    
    kriging.mean.jacob  <- aperm(pred[["mean.deriv"]], perm = c(1, 3, 2))
    kriging.cov.jacob <- aperm(pred[["cov.deriv"]], perm = c(3, 4, 1, 2))
    
    if (!minimization) {
        kriging.mean <- -kriging.mean;
        kriging.mean.jacob <- -kriging.mean.jacob
    }
    
    ## Initialisation
    EI.grad <- matrix(0, q, d) # result
    b <- matrix(rep(0, q), q, 1)
    L <- -diag(rep(1, q))
    Dpk <- matrix(0, q, d)
    if (fastCompute == TRUE) {
        termB <- matrix(0, q, d)
    }
    Sigk_dx <- array(0, dim = c(q, d, q, q))
    mk_dx <- array(0, dim = c(q, d, q))

    ## XXXY Notes YD:
    ## 
    ## o Since 'bk' and 'mk' are used through indexed forms, they both
    ## could be vectors.
    ## 
    ## o Why not use bk - mk?
    
    ## First sum of the formula
    for (k in 1:q) {
        bk <- b
        bk[k, 1] <- plugin                   # creation of vector b^(k)
        Lk <- L
        Lk[ , k] <- rep(1, q)                # linear transform: Y to Zk
        tLk <- t(Lk)
        mk <- Lk %*% kriging.mean            # mean of Zk (m^(k) in the formula)
        Sigk <- Lk %*% kriging.cov %*% tLk   # covariance of Zk
        Sigk <- 0.5 * (Sigk + t(Sigk))       # numerical symetrization
        
        ## term A1
        if (is.null(envir)) {
            EI.grad[k, ] <-  EI.grad[k, ] - kriging.mean.jacob[k, , k] *
                pmnorm(x = t(bk), mean = t(mk), varcov = Sigk, maxpts = q * 200)
        } else {
            EI.grad[k, ] <-  EI.grad[k, ] - kriging.mean.jacob[k, , k] * pk[k]
        }
        
        ## compute gradient ans hessian matrix of the CDF term pk.
        gradpk <- GPhi(x = bk - mk, mu = b, Sigma = Sigk)
        hesspk <- HPhi(x = bk, mu = mk, Sigma = Sigk, gradient = gradpk)
        
        ## term A2
        for (l in 1:q) {
            for (j in 1:d) {
                Sigk_dx[l, j, , ] <- Lk %*% kriging.cov.jacob[l, j, , ] %*% tLk
                mk_dx[l, j, ] <- Lk %*% kriging.mean.jacob[l, j, ]
                Dpk[l, j] <- 0.5 * sum(hesspk * Sigk_dx[l, j, , ]) -
                    crossprod(gradpk, mk_dx[l, j, ])
            }
        }
        EI.grad <- EI.grad + (plugin - kriging.mean[k]) * Dpk
        
        ## term B
        if (fastCompute == TRUE) {
            
            gradpk1 <- GPhi(x = bk - mk + Sigk[ , k] * eps, mu = b, Sigma = Sigk)
            hesspk1 <- HPhi(x = bk + Sigk[ , k] * eps,  mu = mk, Sigma = Sigk,
                           gradient = gradpk1)
            for (l in 1:q) {
                for (j in 1:d) {
                    f1 <- -crossprod(mk_dx[l, j, ], gradpk1) +
                        eps * crossprod(Sigk_dx[l, j, , k], gradpk1) +
                        0.5 * sum(Sigk_dx[l, j, , ] * hesspk1)
                    f  <- -crossprod(mk_dx[l, j, ], gradpk) +
                        0.5 * sum(Sigk_dx[l, j, , ] * hesspk)
                    termB[l, j] <- (f1 - f) / eps
                }
            }
        } else {
            B1 <- B2 <- B3 <- matrix(0,q,d)
            for (i in 1:q) {
                Sigk_ik <- Sigk[i, k]
                Sigk_ii <- Sigk[i, i]
                mk_i <- mk[i]
                mk_dx_i <- mk_dx[ , , i]
                bk_i <- bk[i, 1]
                ck_pi <- bk[-i, 1] - mk[-i, ] - (bk[i, 1] - mk[i, 1]) / Sigk_ii * Sigk[-i, i]
                Sigk_pi <- 0.5 * (Sigk[-i, -i] - 1 / Sigk_ii * Sigk[-i, i] %*% t(Sigk[-i, i]) +
                                  t(Sigk[-i, -i] - 1 / Sigk_ii * Sigk[-i, i] %*% t(Sigk[-i, i])))
                Sigk_dx_ii <- Sigk_dx[ , , i, i]
                Sigk_dx_ik <- Sigk_dx[ , , i, k]
                phi_ik <- dnorm(bk[i, 1], mk_i, sqrt(Sigk_ii))
                dphi_ik_dSig <- ((bk_i - mk_i)^2 / (2 * Sigk_ii^2) - 0.5 * 1 / Sigk_ii) * phi_ik
                dphi_ik_dm <- (bk_i - mk_i) / Sigk_ii * phi_ik
                if (is.null(envir)) {
                    Phi_ik <- pmnorm(x = ck_pi, mean = rep(0, q - 1), varcov = Sigk_pi,
                                     maxpts = (q - 1) * 200)
                } else {
                    Phi_ik <- symetric_term[k, i] / phi_ik
                }
                GPhi_ik <- GPhi(x = matrix(ck_pi, q - 1, 1), mu = matrix(0, q - 1, 1),
                                Sigma = Sigk_pi)
                HPhi_ik <- HPhi(x = matrix(ck_pi, q - 1, 1), mu = matrix(0, q - 1, 1),
                                Sigma = Sigk_pi, gradient = GPhi_ik)
                Sigk_mi <- Sigk[-i, i]
                for (l in 1:q) {
                    for (j in 1:d) {
                        ## B1
                        B1[l, j] <- B1[l, j] + Sigk_dx_ik[l, j] * phi_ik * Phi_ik
                        ## B2
                        B2[l, j] <- B2[l, j] + Sigk_ik *
                            (mk_dx_i[l,j] * dphi_ik_dm +
                             dphi_ik_dSig * Sigk_dx_ii[l, j]) * Phi_ik
                        ## B3
                        dck_pi <- -mk_dx[l, j, -i] +
                            (mk_dx_i[l, j] * Sigk_ii + (bk_i - mk_i) * Sigk_dx_ii[l, j]) /
                            Sigk_ii^2 * Sigk_mi -
                            (bk[i, 1] - mk_i) / Sigk_ii * Sigk_dx[l, j, -i, i]
                        SigtCross <- tcrossprod(Sigk_dx[l, j, -i, i], Sigk_mi)
                        dSigk_pi <- Sigk_dx[l, j, -i, -i] + Sigk_dx_ii[l, j] / Sigk_ii^2 *
                            tcrossprod(Sigk_mi, Sigk_mi) - Sigk_ii^-1 *
                            (SigtCross + t(SigtCross))
                        B3[l, j] <- B3[l, j] +
                            Sigk_ik * phi_ik * (crossprod(GPhi_ik, dck_pi) +
                                                0.5 * sum(HPhi_ik * dSigk_pi))
                    }
                }
            }
        }
        if (fastCompute == TRUE) {
            EI.grad <- EI.grad + termB
        } else {
            EI.grad <- EI.grad + B1 + B2 + B3
        }
    }
    if (is.nan(sum(EI.grad))) {
        return(matrix(0, q, d))
    }
    return (EI.grad)
}