R/hetGP.R

Defines functions rho_AN auto_bounds scores logLikH find_reps rebuild.hetGP plot.hetGP print.hetGP print.summary.hetGP summary.hetGP predict.hetGP mleHetGP compareGP dlogLikHet logLikHet rebuild.homGP predict.homGP plot.homGP print.homGP print.summary.homGP summary.homGP mleHomGP dlogLikHom logLikHom

Documented in compareGP find_reps logLikH mleHetGP mleHomGP predict.hetGP predict.homGP rebuild.hetGP rebuild.homGP scores

################################################################################
### Contains:
### - log-likelihood computations using simplified expression in case of replicates
### - maximization of log-likelihood functions
### - prediction given hyperparameters
################################################################################

################################################################################
## Part I: Homoskedastic noise
################################################################################

## Model: noisy observations with unknown homoskedastic noise
## K = nu^2 * (C + g * I)
# X0 unique designs matrix
# Z0 averaged observations at X0
# Z observations vector (all observations)
# mult number of replicates at each unique design
# theta vector of lengthscale hyperparameters (or one for isotropy)
# g noise variance for the process
# beta0 trend
# @return loglikelihood value
## Log-likelihood for a homoskedastic GP, using the simplified expression
## under replication; the scale nu^2 is concentrated out (plugin estimator).
## Model: noisy observations with unknown homoskedastic noise
## K = nu^2 * (C + g * I)
# X0 unique designs matrix
# Z0 averaged observations at X0
# Z observations vector (all observations)
# mult number of replicates at each unique design
# theta vector of lengthscale hyperparameters (or one for isotropy)
# g noise variance for the process
# beta0 trend (GLS-estimated when NULL)
# env optional environment used to cache C and Ki for reuse in dlogLikHom
# @return loglikelihood value
logLikHom <- function(X0, Z0, Z, mult, theta, g, beta0 = NULL, covtype = "Gaussian", eps = sqrt(.Machine$double.eps), env = NULL){
  n <- nrow(X0)
  N <- length(Z)
  
  # Temporarily store Cholesky transform of K in Ki
  C <- cov_gen(X1 = X0, theta = theta, type = covtype)
  if(!is.null(env)) env$C <- C
  Ki <- chol(C + diag(eps + g / mult))
  ldetKi <- - 2 * sum(log(diag(Ki))) # log determinant from Cholesky
  Ki <- chol2inv(Ki)
  
  if(!is.null(env)) env$Ki <- Ki
  
  # GLS estimate of the constant trend when not provided
  if(is.null(beta0))
    beta0 <- drop(colSums(Ki) %*% Z0 / sum(Ki))
  
  psi_0 <- drop(crossprod(Z0 - beta0, Ki) %*% (Z0 - beta0))
  
  # Plugin estimator of the process variance, exploiting replication
  psi <- 1/N * ((crossprod(Z - beta0) - crossprod((Z0 - beta0) * mult, Z0 - beta0))/g + psi_0)
  
  loglik <- -N/2 * log(2*pi) - N/2 * log(psi) + 1/2 * ldetKi - (N - n)/2 * log(g) - 1/2 * sum(log(mult)) - N/2
  # Explicit return: the original ended on an assignment, which returns the
  # value invisibly.
  return(loglik)
}

# derivative of log-likelihood for logLikHom with respect to theta with all observations (Gaussian kernel)
## Model: noisy observations with unknown homoskedastic noise
## K = nu^2 * (C + g * I)
# X0  design matrix (no replicates)
# Z0 averaged observations
# Z observations vector (all observations)
# mult number of replicates per unique design point
# theta vector of lengthscale hyperparameters (or one for isotropy)
# g noise variance for the process
# beta0 trend
## @return gradient with respect to theta and g
# Gradient of logLikHom with respect to theta and/or g.
## Model: noisy observations with unknown homoskedastic noise
## K = nu^2 * (C + g * I)
# X0  design matrix (no replicates)
# Z0 averaged observations
# Z observations vector (all observations)
# mult number of replicates per unique design point
# theta vector of lengthscale hyperparameters (or one for isotropy)
# g noise variance for the process
# beta0 trend (GLS-estimated when NULL)
# components which partial derivatives to compute ("theta" and/or "g")
# env optional environment holding C and Ki cached by logLikHom
## @return gradient with respect to theta and g (concatenated)
dlogLikHom <- function(X0, Z0, Z, mult, theta, g, beta0 = NULL, covtype = "Gaussian",
                       eps = sqrt(.Machine$double.eps), components = c("theta", "g"), env = NULL){
  k <- length(Z)  # total number of observations N
  n <- nrow(X0)   # number of unique designs
  
  # Reuse cached covariance quantities from logLikHom when available
  if(!is.null(env)){
    C <- env$C
    Ki <- env$Ki
  }else{
    C <- cov_gen(X1 = X0, theta = theta, type = covtype)
    Ki <- chol2inv(chol(C + diag(eps + g / mult)))
  }
  
  if(is.null(beta0))
    beta0 <- drop(colSums(Ki) %*% Z0 / sum(Ki))
  
  # Center observations once
  Z0 <- Z0 - beta0
  Z <- Z - beta0
  
  KiZ0 <- Ki %*% Z0 ## to avoid recomputing
  
  psi <- drop(crossprod(Z0, KiZ0))
  
  # Invariant quantities, hoisted out of the theta loop (the original
  # recomputed them at every iteration and again for the g component)
  sq_res <- drop(crossprod(Z) - crossprod(Z0 * mult, Z0))
  denom <- sq_res / g + psi
  
  tmp1 <- tmp2 <- NULL
  
  # First component, derivative with respect to theta
  if("theta" %in% components){
    tmp1 <- rep(NA, length(theta))
    if(length(theta) == 1){
      # Isotropic case: a single lengthscale for all dimensions
      dC_dthetak <- partial_cov_gen(X1 = X0, theta = theta, type = covtype, arg = "theta_k") * C
      tmp1 <- k/2 * crossprod(KiZ0, dC_dthetak) %*% KiZ0 / denom - 1/2 * trace_sym(Ki, dC_dthetak)
    }else{
      for(i in seq_along(theta)){
        dC_dthetak <- partial_cov_gen(X1 = X0[, i, drop = FALSE], theta = theta[i], type = covtype, arg = "theta_k") * C
        tmp1[i] <- k/2 * crossprod(KiZ0, dC_dthetak) %*% KiZ0 / denom - 1/2 * trace_sym(Ki, dC_dthetak)
      }
    } 
  }
  
  # Second component, derivative with respect to g
  if("g" %in% components)
    tmp2 <- k/2 * (sq_res/g^2 + sum(KiZ0^2/mult)) / denom - (k - n)/ (2*g) - 1/2 * sum(diag(Ki)/mult)
  
  return(c(tmp1, tmp2))
}

#' Gaussian process regression under homoskedastic noise based on maximum likelihood estimation of the 
#' hyperparameters. This function is enhanced to deal with replicated observations.
#' @title Gaussian process modeling with homoskedastic noise
#' @param X matrix of all designs, one per row, or list with elements:
#' \itemize{
#'   \item \code{X0} matrix of unique design locations, one point per row
#'   \item \code{Z0} vector of averaged observations, of length \code{nrow(X0)}
#'   \item \code{mult} number of replicates at designs in \code{X0}, of length \code{nrow(X0)}
#' } 
#' @param Z vector of all observations. If using a list with \code{X}, \code{Z} has to be ordered with respect to \code{X0}, and of length \code{sum(mult)}
#' @param lower,upper optional bounds for the \code{theta} parameter (see \code{\link[hetGP]{cov_gen}} for the exact parameterization).
#' In the multivariate case, it is possible to give vectors for bounds (resp. scalars) for anisotropy (resp. isotropy) 
#' @param known optional list of known parameters, e.g., \code{beta0}, \code{theta} or \code{g}
#' @param covtype covariance kernel type, either 'Gaussian', 'Matern5_2' or 'Matern3_2', see \code{\link[hetGP]{cov_gen}}
#' @param noiseControl list with element:
#' \itemize{
#' \item \code{g_bounds}, vector providing minimal and maximal noise to signal ratio
#' } 
#' @param init optional list specifying starting values for MLE optimization, with elements:
#' \itemize{
#'  \item \code{theta_init} initial value of the theta parameters to be optimized over (default to 10\% of the range determined with \code{lower} and \code{upper})
#'  \item \code{g_init} initial value of the nugget parameter to be optimized over (based on the variance at replicates if there are any, else \code{0.1})
#' }
#' @param maxit maximum number of iteration for L-BFGS-B of \code{\link[stats]{optim}}
#' @param eps jitter used in the inversion of the covariance matrix for numerical stability
#' @param settings list with argument \code{return.Ki}, to include the inverse covariance matrix in the object for further use (e.g., prediction).
#' Arguments \code{factr} (default to 1e9) and \code{pgtol} are available to be passed to \code{control} for L-BFGS-B in \code{\link[stats]{optim}} (for the joint likelihood only). 
#' @return a list which is given the S3 class "\code{homGP}", with elements:
#' \itemize{
#' \item \code{theta}: maximum likelihood estimate of the lengthscale parameter(s),
#' \item \code{g}: maximum likelihood estimate of the nugget variance,
#' \item \code{trendtype}: either "\code{SK}" if \code{beta0} is given, else "\code{OK}" 
#' \item \code{beta0}: estimated trend unless given in input,
#' \item \code{nu_hat}: plugin estimator of the variance,
#' \item \code{ll}: log-likelihood value,
#' \item \code{X0}, \code{Z0}, \code{Z}, \code{mult}, \code{eps}, \code{covtype}: values given in input,
#' \item \code{call}: user call of the function
#' \item \code{used_args}: list with arguments provided in the call
#' \item \code{nit_opt}, \code{msg}: \code{counts} and \code{msg} returned by \code{\link[stats]{optim}}
#' \item \code{Ki}: inverse covariance matrix (not scaled by \code{nu_hat}) (if \code{return.Ki} is \code{TRUE} in \code{settings})
#' \item \code{time}: time to train the model, in seconds.
#' 
#'}
#' @details
#' The global covariance matrix of the model is parameterized as \code{nu_hat * (C + g * diag(1/mult)) = nu_hat * K},
#' with \code{C} the correlation matrix between unique designs, depending on the family of kernel used (see \code{\link[hetGP]{cov_gen}} for available choices) and values of lengthscale parameters.
#' \code{nu_hat} is the plugin estimator of the variance of the process.
#' 
#' It is generally recommended to use \code{\link[hetGP]{find_reps}} to pre-process the data, to rescale the inputs to the unit cube and to normalize the outputs.
#' 
#' @seealso \code{\link[hetGP]{predict.homGP}} for predictions, \code{\link[hetGP]{update.homGP}} for updating an existing model. 
#' \code{summary} and \code{plot} functions are available as well. 
#' \code{\link[hetGP]{mleHomTP}} provides a Student-t equivalent.
#' @references 
#' M. Binois, Robert B. Gramacy, M. Ludkovski (2018), Practical heteroskedastic Gaussian process modeling for large simulation experiments,
#' Journal of Computational and Graphical Statistics, 27(4), 808--821.\cr 
#' Preprint available on arXiv:1611.05902. \cr \cr
#' @export
## ' @importFrom numDeriv hessian
#' @examples
#' ##------------------------------------------------------------
#' ## Example 1: Homoskedastic GP modeling on the motorcycle data
#' ##------------------------------------------------------------
#' set.seed(32)
#' 
#' ## motorcycle data
#' library(MASS)
#' X <- matrix(mcycle$times, ncol = 1)
#' Z <- mcycle$accel
#' plot(X, Z, ylim = c(-160, 90), ylab = 'acceleration', xlab = "time")
#' 
#'  
#' model <- mleHomGP(X = X, Z = Z, lower = 0.01, upper = 100)
#'   
#' ## Display averaged observations
#' points(model$X0, model$Z0, pch = 20) 
#' xgrid <- matrix(seq(0, 60, length.out = 301), ncol = 1) 
#' predictions <- predict(x = xgrid, object =  model)
#' 
#' ## Display mean prediction
#' lines(xgrid, predictions$mean, col = 'red', lwd = 2)
#' ## Display 95% confidence intervals
#' lines(xgrid, qnorm(0.05, predictions$mean, sqrt(predictions$sd2)), col = 2, lty = 2)
#' lines(xgrid, qnorm(0.95, predictions$mean, sqrt(predictions$sd2)), col = 2, lty = 2)
#' ## Display 95% prediction intervals
#' lines(xgrid, qnorm(0.05, predictions$mean, sqrt(predictions$sd2 + predictions$nugs)), 
#'   col = 3, lty = 2)
#' lines(xgrid, qnorm(0.95, predictions$mean, sqrt(predictions$sd2 + predictions$nugs)), 
#'   col = 3, lty = 2)

mleHomGP <- function(X, Z, lower = NULL, upper = NULL, known = NULL,
                     noiseControl = list(g_bounds = c(sqrt(.Machine$double.eps), 1e2)),
                     init = NULL,
                     covtype = c("Gaussian", "Matern5_2", "Matern3_2"),
                     maxit = 100, eps = sqrt(.Machine$double.eps), settings = list(return.Ki = TRUE, factr = 1e7)){
  
  # Accept either pre-processed data (list with X0/Z0/mult, as from find_reps)
  # or a raw design matrix whose replicates are detected here
  if(typeof(X)=="list"){
    X0 <- X$X0
    Z0 <- X$Z0
    mult <- X$mult
    if(sum(mult) != length(Z)) stop("Length(Z) should be equal to sum(mult)")
    if(is.null(dim(X0))) X0 <- matrix(X0, ncol = 1)
    if(length(Z0) != nrow(X0)) stop("Dimension mismatch between Z0 and X0")
  }else{
    if(is.null(dim(X))) X <- matrix(X, ncol = 1)
    if(nrow(X) != length(Z)) stop("Dimension mismatch between Z and X")
    elem <- find_reps(X, Z, return.Zlist = F)
    X0 <- elem$X0
    Z0 <- elem$Z0
    Z <- elem$Z
    mult <- elem$mult
  }
  
  covtype <- match.arg(covtype)
  
  # Derive default lengthscale bounds from the design when not user-provided;
  # in that case theta starts at the geometric mean of the bounds
  if(is.null(lower) || is.null(upper)){
    auto_thetas <- auto_bounds(X = X0, covtype = covtype)
    if(is.null(lower)) lower <- auto_thetas$lower
    if(is.null(upper)) upper <- auto_thetas$upper
    if(is.null(known[["theta"]]) && is.null(init$theta)) init$theta <- sqrt(upper * lower)
  }
  if(length(lower) != length(upper)) stop("upper and lower should have the same size")
  
  ## Save time to train model
  tic <- proc.time()[3]
  
  if(is.null(settings$return.Ki)) settings$return.Ki <- TRUE
  if(is.null(noiseControl$g_bounds)) noiseControl$g_bounds <- c(sqrt(.Machine$double.eps), 1e2)
  
  g_min <- noiseControl$g_bounds[1]
  g_max <- noiseControl$g_bounds[2]
  
  beta0 <- known$beta0
  
  N <- length(Z)
  n <- nrow(X0)
  
  if(is.null(n))
    stop("X0 should be a matrix. \n")
  
  # Default starting values: theta at 10% of the range (unless already set
  # above), g from the empirical variance at replicated designs when available
  if(is.null(known[["theta"]]) && is.null(init$theta)) init$theta <- 0.9 * lower + 0.1 * upper # useful for mleHetGP
  if(is.null(known$g) && is.null(init$g)){
    if(any(mult > 2)) init$g <- mean((fast_tUY2(mult, (Z - rep(Z0, times = mult))^2)/mult)[which(mult > 2)])/var(Z0) else init$g <- 0.1
  }
  
  # Simple kriging (SK) if the trend is known, ordinary kriging (OK) otherwise
  trendtype <- 'OK'
  if(!is.null(beta0))
    trendtype <- 'SK'
  
  ## General definition of fn and gr
  # Objective for optim: unpack par into (theta, g) depending on which are
  # fixed via 'known', then evaluate the log-likelihood
  fn <- function(par, X0, Z0, Z, mult, beta0, theta, g, env){
    idx <- 1 # to store the first non used element of par
    
    if(is.null(theta)){
      theta <- par[1:length(init$theta)]
      idx <- idx + length(init$theta)
    }
    
    if(is.null(g)){
      g <- par[idx]
    }
    
    loglik <- logLikHom(X0 = X0, Z0 = Z0, Z = Z, mult = mult, theta = theta, g = g, beta0 = beta0, covtype = covtype, eps = eps, env = env)
    
    # Track the best parameters seen so far, so a later optim failure can
    # still return a usable fit
    if(!is.null(env) && !is.na(loglik)){
      if(is.null(env$max_loglik) || loglik > env$max_loglik){
        env$max_loglik <- loglik
        env$arg_max <- par
      }
    } 
    
    return(loglik)
  }
  
  # Gradient of fn, restricted to the components actually being optimized
  gr <- function(par, X0, Z0, Z, mult, beta0, theta, g, env){
    idx <- 1
    components <- NULL
    
    if(is.null(theta)){
      theta <- par[1:length(init$theta)]
      idx <- idx + length(init$theta)
      components <- "theta"
    }
    
    if(is.null(g)){
      g <- par[idx]
      components <- c(components, "g")
    }
    return(dlogLikHom(X0 = X0, Z0 = Z0, Z = Z, mult = mult, theta = theta, g = g, beta0 = beta0, covtype = covtype, eps = eps,
                      components = components, env = env))
  }
  
  ## Both known
  # envtmp serves both as cache (C, Ki shared between fn and gr) and as
  # best-so-far storage written by fn, read in the try-error fallback below
  envtmp <- environment()
  if(!is.null(known$g) && !is.null(known[["theta"]])){
    theta_out <- known[["theta"]]
    g_out <- known$g
    out <- list(value = logLikHom(X0 = X0, Z0 = Z0, Z = Z, mult = mult, theta = theta_out, g = g_out, beta0 = beta0, covtype = covtype, eps = eps),
                message = "All hyperparameters given", counts = 0, time = proc.time()[3] - tic)
  }else{
    # Assemble starting point and box bounds for the free parameters only
    parinit <- lowerOpt <- upperOpt <- NULL
    if(is.null(known[["theta"]])){
      parinit <- init$theta
      lowerOpt <- c(lower)
      upperOpt <- c(upper)
    }
    if(is.null(known$g)){
      parinit <- c(parinit, init$g)
      lowerOpt <- c(lowerOpt, g_min)
      upperOpt <- c(upperOpt, g_max)
    }
    
    out <- try(optim(par = parinit, fn = fn, gr = gr, method = "L-BFGS-B", lower = lowerOpt, upper = upperOpt, theta = known[["theta"]], g = known$g,
                     X0 = X0, Z0 = Z0, Z = Z, mult = mult, beta0 = beta0,
                     control = list(fnscale = -1, maxit = maxit, factr = settings$factr, pgtol = settings$pgtol), env = envtmp))
    ## Catch errors when at least one likelihood evaluation worked
    if(is(out, "try-error"))
      out <- list(par = envtmp$arg_max, value = envtmp$max_loglik, counts = NA,
                  message = "Optimization stopped due to NAs, use best value so far")
    
    # Recover the optimized values; g is always the last element of par
    if(is.null(known$g)) g_out <- out$par[length(out$par)] else g_out <- known$g
    if(is.null(known[["theta"]])) theta_out <- out$par[1:length(init$theta)] else theta_out <- known[["theta"]]
    
  }
  
  # Final inverse covariance at the optimum, and plugin variance estimate nu
  Ki <- chol2inv(chol(add_diag(cov_gen(X1 = X0, theta = theta_out, type = covtype), eps + g_out/ mult)))
  
  if(is.null(beta0))
    beta0 <- drop(colSums(Ki) %*% Z0 / sum(Ki))
  
  psi_0 <- drop(crossprod(Z0 - beta0, Ki) %*% (Z0 - beta0))
  
  nu <- 1/N * ((crossprod(Z - beta0) - crossprod((Z0 - beta0) * mult, Z0 - beta0))/g_out + psi_0)
  
  # # Get hessian of our cost function/
  # if (is.null(known[["theta"]])) {
  #   # Jacobian is more precise numerically but doesn't seem to work for some reason
  #   #hess <- jacobian(func = gr, x = out$par, theta = known[["theta"]], g = known$g,
  #   #             X0 = X0, Z0 = Z0, Z = Z, mult = mult, beta0 = beta0, env = envtmp)
  #   fwrap <- function(par, ...) fn(c(par, out$par[length(out$par)]), ...)
  #   hess <- hessian(func = fwrap, x = out$par[1:(length(out$par)-1)], theta = known[["theta"]], g = known$g,
  #                   X0 = X0, Z0 = Z0, Z = Z, mult = mult, beta0 = beta0, env = NULL)
  # } else {
  #   hess <- NULL
  # }
  
  res <- list(theta = theta_out, g = g_out, nu_hat = as.numeric(nu), ll = out$value, nit_opt = out$counts,
              beta0 = beta0, trendtype = trendtype, covtype = covtype, msg = out$message, eps = eps,
              X0 = X0, Z0 = Z0, Z = Z, mult = mult, call = match.call(),
              used_args = list(lower = lower, upper = upper, known = known, noiseControl = noiseControl),
              time = proc.time()[3] - tic) # hess = hess)
  
  if(settings$return.Ki) res <- c(res, list(Ki = Ki))
  
  class(res) <- "homGP"
  return(res)
}

#' @method summary homGP
#' @export
summary.homGP <- function(object, ...){
  ## Return a tagged copy of the model so that the dedicated
  ## print.summary.homGP method is dispatched on it
  res <- object
  class(res) <- "summary.homGP"
  return(res)
}

#' @export
print.summary.homGP <- function(x, ...){
  ## Data dimensions: total observations, unique designs, input dimension
  cat("N = ", length(x$Z), " n = ", length(x$Z0), " d = ", ncol(x$X0), "\n")
  cat(x$covtype, " covariance lengthscale values: ", x$theta, "\n")
  cat("Homoskedastic nugget value: ", x$g, "\n")
  cat("Variance/scale hyperparameter: ", x$nu_hat, "\n")
  
  ## "SK" means beta0 was supplied by the user; otherwise it was estimated
  trend_label <- if(x$trendtype == "SK") "Given" else "Estimated"
  cat(paste0(trend_label, " constant trend value: "), x$beta0, "\n")
  
  cat("MLE optimization: \n", "Log-likelihood = ", x$ll, "; Nb of evaluations (obj, gradient) by L-BFGS-B: ", x$nit_opt, "; message: ", x$msg, "\n")
}

#' @method print homGP
#' @export
print.homGP <- function(x, ...){
  ## Delegate all formatting to the summary print method
  print(summary(x))
}

#' @method plot homGP
#' @export
#' @importFrom graphics abline legend plot points arrows
#' @importFrom stats qnorm
plot.homGP <- function(x, ...){
  ## Leave-one-out predictions at the unique designs
  preds <- LOO_preds(x)
  ## Expand per-design LOO means to all replicates, so each observation
  ## in Z is plotted against the prediction at its design
  idx_all <- rep(seq_len(nrow(x$X0)), times = x$mult)
  plot(x$Z, preds$mean[idx_all], xlab = "Observed values", ylab = "Predicted values",
       main = "Leave-one-out predictions")
  ## Horizontal 90% LOO prediction intervals
  off_lo <- sqrt(preds$sd2) * qnorm(0.05)
  off_hi <- sqrt(preds$sd2) * qnorm(0.95)
  arrows(x0 = preds$mean + off_lo, x1 = preds$mean + off_hi,
         y0 = preds$mean, length = 0, col = "blue")
  ## Highlight averaged observations at replicated designs
  reps <- which(x$mult > 1)
  points(x$Z0[reps], preds$mean[reps], pch = 20, col = 2)
  abline(a = 0, b = 1, lty = 3)
  legend("topleft", pch = c(1, 20, NA), lty = c(NA, NA, 1), col = c(1, 2, 4),
         legend = c("observations", "averages (if > 1 observation)", "LOO prediction interval"))
}

#' Gaussian process predictions using a homoskedastic noise GP object (of class \code{homGP})
#' @param x matrix of designs locations to predict at (one point per row)
#' @param object an object of class \code{homGP}; e.g., as returned by \code{\link[hetGP]{mleHomGP}}
#' @param xprime optional second matrix of predictive locations to obtain the predictive covariance matrix between \code{x} and \code{xprime}
#' @param ... no other argument for this method
#' @return list with elements
#' \itemize{
#' \item \code{mean}: kriging mean;
#' \item \code{sd2}: kriging variance (filtered, e.g. without the nugget value)
#' \item \code{cov}: predictive covariance matrix between \code{x} and \code{xprime}
#' \item \code{nugs}: nugget value at each prediction location, for consistency with \code{\link[hetGP]{mleHomGP}}.
#' }
#' @importFrom MASS ginv
#' @details The full predictive variance corresponds to the sum of \code{sd2} and \code{nugs}. See \code{\link[hetGP]{mleHomGP}} for examples.
#' @method predict homGP
#' @export
predict.homGP <- function(object, x, xprime = NULL, ...){
  # Coerce vector inputs to one-row matrices, checking dimension compatibility
  if(is.null(dim(x))){
    x <- matrix(x, nrow = 1)
    if(ncol(x) != ncol(object$X0)) stop("x is not a matrix")
  }
  
  if(!is.null(xprime) && is.null(dim(xprime))){
    xprime <- matrix(xprime, nrow = 1)
    if(ncol(xprime) != ncol(object$X0)) stop("xprime is not a matrix")
  }
  
  # Rebuild the inverse covariance matrix if it was stripped from the object
  if(is.null(object$Ki))
    object$Ki <- chol2inv(chol(add_diag(cov_gen(X1 = object$X0, theta = object$theta, type = object$covtype), object$g/object$mult + object$eps)))
  
  # Scale Ki by the variance estimate (local copy; caller's object unchanged)
  object$Ki <- object$Ki/object$nu_hat
  
  # Cross covariances between prediction locations and unique designs
  kx <- object$nu_hat * cov_gen(X1 = x, X2 = object$X0, theta = object$theta, type = object$covtype)
  
  nugs <- rep(object$nu_hat * object$g, nrow(x))
  mean <- as.vector(object$beta0 + kx %*% (object$Ki %*% (object$Z0 - object$beta0)))
  
  if(object$trendtype == 'SK'){
    sd2 <- as.vector(object$nu_hat - fast_diag(kx, tcrossprod(object$Ki, kx)))
  }else{
    # Ordinary kriging: extra term accounting for estimation of the trend
    sd2 <- as.vector(object$nu_hat - fast_diag(kx, tcrossprod(object$Ki, kx)) + (1 - tcrossprod(rowSums(object$Ki), kx))^2/sum(object$Ki))
  }
  
  ## In case of numerical errors, some sd2 values may become negative
  if(any(sd2 < 0)){
    sd2 <- pmax(0, sd2)
    warning("Numerical errors caused some negative predictive variances to be thresholded to zero. Consider using ginv via rebuild.homGP")
  }
  
  if(!is.null(xprime)){
    kxprime <- object$nu_hat * cov_gen(X1 = object$X0, X2 = xprime, theta = object$theta, type = object$covtype)
    # Prior covariance between x and xprime, computed once (the original
    # duplicated this expression in each of the four branches below)
    kxxprime <- object$nu_hat * cov_gen(X1 = x, X2 = xprime, theta = object$theta, type = object$covtype)
    # Matrix product order chosen to minimize cost depending on sizes
    if(nrow(x) < nrow(xprime)){
      crossterm <- kx %*% object$Ki %*% kxprime
    }else{
      crossterm <- kx %*% (object$Ki %*% kxprime)
    }
    if(object$trendtype == 'SK'){
      cov <- kxxprime - crossterm
    }else{
      # Ordinary kriging correction for the estimated trend
      cov <- kxxprime - crossterm + crossprod(1 - tcrossprod(rowSums(object$Ki), kx), 1 - rowSums(object$Ki) %*% kxprime)/sum(object$Ki)
    }
  }else{
    cov = NULL
  }
  
  return(list(mean = mean, sd2 = sd2, nugs = nugs, cov = cov))
}

## Define the "rebuild" S4 generic only if it does not already exist
## (e.g., when another loaded package has already created it)
if(!isGeneric("rebuild")) {
  setGeneric(name = "rebuild",
             def = function(object, ...) standardGeneric("rebuild")
  )
}

#' Functions to make \code{hetGP} objects lighter before exporting them, and to reverse this after import.
#' The \code{rebuild} function may also be used to obtain more robust inverse of covariance matrices using \code{\link[MASS]{ginv}}.
#' @title Import and export of hetGP objects
#' @param object \code{homGP} or \code{homTP} model without slot \code{Ki} (inverse covariance matrix),
#'  or \code{hetGP} or \code{hetTP} model without slot \code{Ki} or \code{Kgi}
#' @param robust if \code{TRUE} \code{\link[MASS]{ginv}} is used for matrix inversion, otherwise it is done via Cholesky.
## ' @param ... not used
#' @export
#' @return \code{object} with additional or removed slots.
#' @rdname ExpImp
#' @examples 
#' set.seed(32)
#' ## motorcycle data
#' library(MASS)
#' X <- matrix(mcycle$times, ncol = 1)
#' Z <- mcycle$accel
#' ## Model fitting
#' model <- mleHetGP(X = X, Z = Z, lower = 0.1, upper = 50)
#' 
#' # Check size
#' object.size(model)
#' 
#' # Remove internal elements, e.g., to save it
#' model <- strip(model)
#' 
#' # Check new size
#' object.size(model)
#' 
#' # Now rebuild model, and use ginv instead
#' model <- rebuild(model, robust = TRUE)
#' object.size(model)
#' 
rebuild <- function (object, robust) {
  ## S3 dispatch on the class of object (homGP, hetGP, homTP, hetTP);
  ## dispatching on the first argument is UseMethod's default
  UseMethod("rebuild")
}

## ' Rebuild inverse covariance matrix of \code{homGP} (e.g., if exported without \code{Ki})
## ' @param object \code{homGP} model without slot \code{Ki} (inverse covariance matrix)
## ' @param robust if \code{TRUE} \code{\link[MASS]{ginv}} is used for matrix inversion, otherwise it is done via Cholesky.
#' @method rebuild homGP
#' @rdname ExpImp
#' @export
rebuild.homGP <- function(object, robust = FALSE){
  # Covariance matrix K = C + diag(g/mult + eps), built once instead of
  # duplicating the expression in both branches
  K <- add_diag(cov_gen(X1 = object$X0, theta = object$theta, type = object$covtype), object$g/object$mult + object$eps)
  if(robust){
    # Moore-Penrose pseudo-inverse: slower, but robust to near-singular K.
    # NOTE(review): only this branch scales by 1/nu_hat while the Cholesky
    # branch does not, yet predict.homGP rescales Ki by 1/nu_hat itself --
    # confirm the robust path is not scaled twice downstream.
    object$Ki <- ginv(K)/object$nu_hat
  }else{
    object$Ki <- chol2inv(chol(K))
  }
  
  return(object)
}

#' @export
#' @rdname ExpImp
strip <- function (object) {
  ## Drop the heavy internal slots (inverse covariance matrices and nested
  ## models) before export; rebuild() can restore what is needed later
  for(slot in c("Ki", "Kgi", "modHom", "modNugs")){
    if(!is.null(object[[slot]])) object[[slot]] <- NULL
  }
  return(object)
}


###############################################################################
## Part II: Heterogeneous GP with all options for the fit
###############################################################################

## ' log-likelihood in the anisotropic case - one lengthscale by variable
## ' Model: K = nu2 * (C + Lambda) = nu using all observations using the replicates information
## ' nu2 is replaced by its plugin estimator in the likelihood
## ' @param X0 unique designs
## ' @param Z0 averaged observations
## ' @param Z replicated observations (sorted with respect to X0)
## ' @param mult number of replicates at each Xi
## ' @param Delta vector of nuggets corresponding to each X0i or pXi, that are smoothed to give Lambda
## ' @param logN should exponentiated variance be used
## ' @param SiNK should the smoothing come from the SiNK predictor instead of the kriging one
## ' @param theta scale parameter for the mean process, either one value (isotropic) or a vector (anisotropic)
## ' @param k_theta_g constant used for linking nuggets lengthscale to mean process lengthscale, i.e., theta_g[k] = k_theta_g * theta[k], alternatively theta_g can be used
## ' @param theta_g either one value (isotropic) or a vector (anisotropic), alternative to using k_theta_g
## ' @param g nugget of the nugget process
## ' @param pX matrix of pseudo inputs locations of the noise process for Delta (could be replaced by a vector to avoid double loop)
## ' @param beta0 mean, if not provided, the MLE estimator is used
## ' @param eps minimal value of elements of Lambda
## ' @param covtype covariance kernel type
## ' @param penalty should a penalty term on Delta be used?
## ' @param hom_ll reference homoskedastic likelihood
## ' @export
## Heteroskedastic GP log-likelihood (possibly penalized), using the
## simplified expression under replication; the scale nu2 is replaced by its
## plugin estimator. Arguments are documented in the comment block above.
logLikHet <- function(X0, Z0, Z, mult, Delta, theta, g, k_theta_g = NULL, theta_g = NULL, logN = FALSE, SiNK = FALSE,
                      beta0 = NULL, pX = NULL, eps = sqrt(.Machine$double.eps), covtype = "Gaussian", SiNK_eps = 1e-4,
                      penalty = TRUE, hom_ll = NULL, env = NULL, trace = 0){
  n <- nrow(X0)
  N <- length(Z)
  
  # Lengthscales of the noise process, possibly tied to those of the mean process
  if(is.null(theta_g))
    theta_g <- k_theta_g * theta
  
  # Smooth the latent log-variances Delta into Lambda with a second GP,
  # either anchored at the unique designs X0 or at pseudo-inputs pX
  if(is.null(pX)){
    Cg <- cov_gen(X1 = X0, theta = theta_g, type = covtype)
    Kg_c <- chol(Cg + diag(eps + g/mult))
    Kgi <- chol2inv(Kg_c)
    
    nmean <- drop(rowSums(Kgi) %*% Delta / sum(Kgi)) ## ordinary kriging mean
    
    if(SiNK){
      # Rescaled (SiNK) smoothing of the latent variances
      rhox <- 1 / rho_AN(xx = X0, X0 = X0, theta_g = theta_g, g = g, type = covtype, eps = eps, SiNK_eps = SiNK_eps, mult = mult)
      M <-  rhox * Cg %*% (Kgi %*% (Delta - nmean))
    }else{
      M <- Cg %*% (Kgi %*% (Delta - nmean))
    }
    
  }else{
    Cg <- cov_gen(X1 = pX, theta = theta_g, type = covtype)
    Kg_c <- chol(Cg + diag(eps + g/mult))
    Kgi <- chol2inv(Kg_c)
    
    kg <- cov_gen(X1 = X0, X2 = pX, theta = theta_g, type = covtype)
    
    nmean <- drop(rowSums(Kgi) %*% Delta / sum(Kgi)) ## ordinary kriging mean
    
    if(SiNK){
      rhox <- 1 / rho_AN(xx = X0, X0 = pX, theta_g = theta_g, g = g, type = covtype, eps = eps, SiNK_eps = SiNK_eps, mult = mult)
      M <-  rhox * kg %*% (Kgi %*% (Delta - nmean))
    }else{
      M <- kg %*% (Kgi %*% (Delta - nmean))
    }
  }
  
  Lambda <- drop(nmean + M)
  
  if(logN){
    # Delta modeled on the log scale: map back to variances
    Lambda <- exp(Lambda)
  }else{
    # Guard against non-positive smoothed variances
    Lambda[Lambda <= 0] <- eps
  }
  
  # Expand per-design variances to all replicates
  LambdaN <- rep(Lambda, times = mult)
  
  # Temporarily store Cholesky transform of K in Ki
  C <- cov_gen(X1 = X0, theta = theta, type = covtype)
  if(!is.null(env)) env$C <- C
  Ki <- chol(C + diag(Lambda/mult + eps))
  ldetKi <- - 2 * sum(log(diag(Ki))) # log determinant from Cholesky
  Ki <- chol2inv(Ki)
  
  # Cache intermediate quantities for reuse by dlogLikHet
  if(!is.null(env)){
    env$Cg <- Cg
    env$Kg_c <- Kg_c
    env$Kgi <- Kgi
    env$ldetKi <- ldetKi
    env$Ki <- Ki
  }
  
  if(is.null(beta0))
    beta0 <- drop(colSums(Ki) %*% Z0 / sum(Ki))
  
  psi_0 <- drop(crossprod(Z0 - beta0, Ki) %*% (Z0 - beta0))
  
  # Plugin estimator of the scale, exploiting replication
  psi <- 1/N * (crossprod((Z - beta0)/LambdaN, Z - beta0) - crossprod((Z0 - beta0) * mult/Lambda, Z0 - beta0) + psi_0)
  
  loglik <- -N/2 * log(2*pi) - N/2 * log(psi) + 1/2 * ldetKi - 1/2 * sum((mult - 1) * log(Lambda) + log(mult)) - N/2
  
  if(penalty){
    # Penalty: likelihood of the latent noise GP evaluated at Delta
    nu_hat_var <- drop(crossprod(Delta - nmean, Kgi) %*% (Delta - nmean))/length(Delta)
    
    ## To avoid 0 variance, e.g., when Delta = nmean
    if(nu_hat_var < eps){
      return(loglik)
    } 
    
    pen <- - n/2 * log(nu_hat_var) - sum(log(diag(Kg_c))) - n/2*log(2*pi) - n/2
    
    # Do not let a positive penalty inflate a fit that is already worse than
    # the homoskedastic reference
    if(loglik < hom_ll && pen > 0){
      if(trace > 0) warning("Penalty is deactivated when unpenalized likelihood is lower than its homGP equivalent")
      return(loglik)
    }
    
    return(loglik + pen)
  }
  return(loglik)
}




## ' derivative of log-likelihood for logLikHet_Wood with respect to theta and Lambda with all observations
## ' Model: K = nu2 * (C + Lambda) = nu using all observations using the replicates information
## ' nu2 is replaced by its plugin estimator in the likelihood
## ' @param X0 unique designs
## ' @param Z0 averaged observations
## ' @param Z replicated observations (sorted with respect to X0)
## ' @param mult number of replicates at each Xi
## ' @param Delta vector of nuggets corresponding to each X0i or pXi, that are smoothed to give Lambda
## ' @param logN should exponentiated variance be used
## ' @param SiNK should the smoothing come from the SiNK predictor instead of the kriging one
## ' @param theta scale parameter for the mean process, either one value (isotropic) or a vector (anisotropic)
## ' @param k_theta_g constant used for linking nuggets lengthscale to mean process lengthscale, i.e., theta_g[k] = k_theta_g * theta[k], alternatively theta_g can be used
## ' @param theta_g either one value (isotropic) or a vector (anisotropic), alternative to using k_theta_g
## ' @param g nugget of the nugget process
## ' @param pX matrix of pseudo inputs locations of the noise process for Delta (could be replaced by a vector to avoid double loop)
## ' @param components to determine which variable are to be taken in the derivation:
## ' NULL for all, otherwise list with elements from 'theta', 'Delta', 'theta_g', 'k_theta_g', 'pX' and 'g'.
## ' @param beta0 mean, if not provided, the MLE estimator is used
## ' @param eps minimal value of elements of Lambda
## ' @param covtype covariance kernel type
## ' @param penalty should a penalty term on Delta be used?
## ' @param hom_ll reference homoskedastic likelihood
## ' @export
## Gradient of the (possibly penalized) heteroskedastic log-likelihood with
## respect to the hyperparameters selected in 'components'.
## Returns the concatenation of the non-NULL derivatives, in the order:
## theta, Delta, k_theta_g, theta_g, g, pX.
dlogLikHet <- function(X0, Z0, Z, mult, Delta, theta, g, k_theta_g = NULL, theta_g = NULL, beta0 = NULL, pX = NULL,
                       logN = TRUE, SiNK = FALSE, components = NULL, eps = sqrt(.Machine$double.eps), covtype = "Gaussian", SiNK_eps = 1e-4,
                       penalty = TRUE, hom_ll, env = NULL){
  
  ## Verifications
  # Without one of the two, theta_g would silently become numeric(0) below
  # (NULL * theta), leading to a cryptic downstream failure; fail fast instead.
  if(is.null(k_theta_g) && is.null(theta_g))
    stop("Either k_theta_g or theta_g must be provided \n")
  
  ## Initialisations
  
  # Which terms need to be computed
  if(is.null(components)){
    components <- c(list("theta", "Delta", "g"))
    if(is.null(k_theta_g)){
      components <- c(components, list("theta_g"))
    }else{
      components <- c(components, list("k_theta_g"))
    }
    if(!is.null(pX))
      components <- c(components, list("pX"))
  }
  
  # Noise process lengthscales tied to those of the mean process
  if(is.null(theta_g))
    theta_g <- k_theta_g * theta
  
  n <- nrow(X0)
  N <- length(Z)
  
  # Reuse precomputed matrices from the environment when available
  # NOTE(review): the env branch only provides M when pX is NULL; the pX path
  # appears to rely on env also carrying M/kg -- TODO confirm with callers.
  if(!is.null(env)){
    Cg <- env$Cg
    Kg_c <- env$Kg_c
    Kgi <- env$Kgi
    if(is.null(pX)) M <- add_diag(Kgi * (-eps - g / mult), rep(1, n))
  }else{
    if(is.null(pX)){
      Cg <- cov_gen(X1 = X0, theta = theta_g, type = covtype)
      Kg_c <- chol(Cg + diag(eps + g/mult))
      Kgi <- chol2inv(Kg_c)
      # M <- Cg %*% Kgi
      M <- add_diag(Kgi * (-eps - g / mult), rep(1, n))
    }else{
      Cg <- cov_gen(X1 = pX, theta = theta_g, type = covtype)
      Kg_c <- chol(Cg + diag(eps + g/mult))
      Kgi <- chol2inv(Kg_c)
      kg <- cov_gen(X1 = X0, X2 = pX, theta = theta_g, type = covtype)
    }
  }
  
  ## Precomputations for reuse
  
  rSKgi <- rowSums(Kgi)
  sKgi <- sum(Kgi)
  
  nmean <- drop(rSKgi %*% Delta / sKgi) ## ordinary kriging mean
  
  ## Precomputations for reuse
  KgiD <- Kgi %*% (Delta - nmean)
  
  if(SiNK){
    rhox <- 1 / rho_AN(xx = X0, X0 = pX, theta_g = theta_g, g = g, type = covtype, eps = eps, SiNK_eps = SiNK_eps, mult = mult)
    M <-  rhox * M
  }
  
  # Smoothed noise variances at X0 (on log scale if logN)
  Lambda <- drop(nmean + M %*% (Delta - nmean))
  if(logN){
    Lambda <- exp(Lambda)
  }
  else{
    Lambda[Lambda <= 0] <- eps
  }
  # Lambda <- pmax(mult * eps, Lambda)
  
  LambdaN <- rep(Lambda, times = mult)
  
  if(!is.null(env)){
    C <- env$C
    Ki <- env$Ki
    ldetKi <- env$ldetKi
  }else{
    C <- cov_gen(X1 = X0, theta = theta, type = covtype)
    Ki <- chol(C + diag(Lambda/mult + eps))
    ldetKi <- - 2 * sum(log(diag(Ki))) # log determinant from Cholesky
    Ki <- chol2inv(Ki)
  }
  
  # Ki <- chol2inv(chol(C + diag(Lambda/mult + eps)))
  
  # Ordinary kriging trend estimate unless beta0 is fixed
  if(is.null(beta0))
    beta0 <- drop(colSums(Ki) %*% Z0 / sum(Ki))
  
  ## Precomputations for reuse
  KiZ0 <- Ki %*% (Z0 - beta0)
  rsM <- rowSums(M)
  
  psi_0 <- drop(crossprod(KiZ0, Z0 - beta0))
  
  psi <- drop((crossprod((Z - beta0)/LambdaN, Z - beta0) - crossprod((Z0 - beta0) * mult/Lambda, Z0 - beta0)) + psi_0)
  
  if(penalty){
    nu_hat_var <- drop(crossprod(KgiD, (Delta - nmean)))/length(Delta) 
    
    # To prevent numerical issues when Delta = nmean, resulting in divisions by zero
    if(nu_hat_var < eps){
      penalty <- FALSE
    }else{
      loglik <- -N/2 * log(2*pi) - N/2 * log(psi/N)  + 1/2 * ldetKi - 1/2 * sum((mult - 1) * log(Lambda) + log(mult)) - N/2
      pen <- - n/2 * log(nu_hat_var) - sum(log(diag(Kg_c))) - n/2*log(2*pi) - n/2
      if(loglik < hom_ll && pen > 0) penalty <- FALSE
    }
    # if(nu_hat_var < eps || (hardpenalty && (- n/2 * log(nu_hat_var) - sum(log(diag(Kg_c))) - n/2*log(2*pi) - n/2) > 0)) penalty <- FALSE
  }       
  
  dLogL_dtheta <- dLogL_dDelta <- dLogL_dkthetag <- dLogL_dthetag <- dLogL_dg <- dLogL_dpX <- NULL
  
  # First component, derivative of logL with respect to theta
  if("theta" %in% components){
    dLogL_dtheta <- rep(NA, length(theta))
    for(i in 1:length(theta)){
      if(length(theta) == 1){
        dC_dthetak <- partial_cov_gen(X1 = X0, theta = theta, arg = "theta_k", type = covtype) * C # partial derivative of C with respect to theta
      }else{
        dC_dthetak <- partial_cov_gen(X1 = X0[, i, drop = FALSE], theta = theta[i], arg = "theta_k", type = covtype) * C # partial derivative of C with respect to theta
      }
      
      if("k_theta_g" %in% components){
        
        if(is.null(pX)){
          if(length(theta) == 1){
            dCg_dthetak <- partial_cov_gen(X1 = X0, theta = k_theta_g * theta, arg = "theta_k", type = covtype) * k_theta_g * Cg # partial derivative of Cg with respect to theta[i]
          }else{
            dCg_dthetak <- partial_cov_gen(X1 = X0[, i, drop = FALSE], theta = k_theta_g * theta[i], arg = "theta_k", type = covtype) * k_theta_g * Cg # partial derivative of Cg with respect to theta[i]
          }
          
          # Derivative Lambda / theta_k (first part)
          if(SiNK == FALSE)
            dLdtk <- dCg_dthetak %*% KgiD - M %*% (dCg_dthetak %*% KgiD)
          
          # Derivative Lambda / theta_k (first part)
          if(SiNK == TRUE){
            Kgitkg <- t(M) # for reuse
            d_irho_dtheta_k <- -1/2 * (fast_diag(M, t(kg)))^(-3/2) * (fast_diag(dCg_dthetak, Kgitkg) - fast_diag(M, dCg_dthetak %*% Kgitkg) + fast_diag(M, t(dCg_dthetak)))
            dLdtk <- (d_irho_dtheta_k * kg + rhox * (dCg_dthetak - (M) %*% dCg_dthetak)) %*% KgiD
          }
        }else{
          # BUGFIX: was length(theta == 1), whose length() is always >= 1,
          # so the isotropic branch was always taken even for vector theta.
          if(length(theta) == 1){
            dCg_dthetak <- partial_cov_gen(X1 = pX, theta = k_theta_g * theta, arg = "theta_k", type = covtype) * k_theta_g * Cg # partial derivative of Cg with respect to theta[i]
            dkg_dthetak <- partial_cov_gen(X1 = X0, X2 = pX, theta = k_theta_g * theta, arg = "theta_k", type = covtype) * k_theta_g * kg # partial derivative of kg with respect to theta[i]
          }else{
            dCg_dthetak <- partial_cov_gen(X1 = pX[, i, drop = FALSE], theta = k_theta_g * theta[i], arg = "theta_k", type = covtype) * k_theta_g * Cg # partial derivative of Cg with respect to theta[i]
            dkg_dthetak <- partial_cov_gen(X1 = X0[, i, drop = FALSE], X2 = pX[, i, drop = FALSE], theta = k_theta_g * theta[i], arg = "theta_k", type = covtype) * k_theta_g * kg # partial derivative of kg with respect to theta[i]
          }
          
          # Derivative Lambda / theta_k (first part)
          if(SiNK == FALSE)
            dLdtk <- dkg_dthetak %*% KgiD - M %*% (dCg_dthetak %*% KgiD)
          
          # Derivative Lambda / theta_k (first part)
          if(SiNK == TRUE){
            Kgitkg <- t(M) # for reuse
            d_irho_dtheta_k <- -1/2 * (fast_diag(M, t(kg)))^(-3/2) * (fast_diag(dkg_dthetak, Kgitkg) - fast_diag(M, dCg_dthetak %*% Kgitkg) + fast_diag(M, t(dkg_dthetak)))
            dLdtk <- (d_irho_dtheta_k * kg + rhox * (dkg_dthetak - (M) %*% dCg_dthetak)) %*% KgiD
          }
        }
        
        
        # (second part)
        dLdtk <- dLdtk - (1 - rsM) * drop(rSKgi %*% dCg_dthetak %*% (Kgi %*% Delta) * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dthetak %*% rSKgi))/sKgi^2
        
        if(logN)
          dLdtk <- dLdtk * Lambda
        
        dK_dthetak <- add_diag(dC_dthetak, drop(dLdtk)/mult) # dK/dtheta[k]
        dLogL_dtheta[i] <- N/2 * (crossprod((Z - beta0)/LambdaN * rep(dLdtk, times = mult), (Z - beta0)/LambdaN) - crossprod((Z0 - beta0)/Lambda * mult * dLdtk, (Z0 - beta0)/Lambda) +
                                    crossprod(KiZ0, dK_dthetak) %*% KiZ0)/psi - 1/2 * trace_sym(Ki, dK_dthetak)
        dLogL_dtheta[i] <- dLogL_dtheta[i] - 1/2 * sum((mult - 1) * dLdtk/Lambda) # derivative of the sum(a_i - 1)log(lambda_i)
        
        if(penalty)
          dLogL_dtheta[i] <- dLogL_dtheta[i]  + 1/2 * crossprod(KgiD, dCg_dthetak) %*% KgiD / nu_hat_var  - 1/2 * trace_sym(Kgi, dCg_dthetak)
        
      }else{
        dLogL_dtheta[i] <- N/2 * crossprod(KiZ0, dC_dthetak) %*% KiZ0/psi - 1/2 * trace_sym(Ki, dC_dthetak)
      }
    }
  }
  
  # Derivative of logL with respect to Lambda
  if(any(c("Delta", "g", "k_theta_g", "theta_g", "pX") %in% components)){
    dLogLdLambda <- N/2 * ((fast_tUY2(mult, (Z - beta0)^2) - (Z0 - beta0)^2 * mult)/Lambda^2 + KiZ0^2/mult)/psi - (mult - 1)/(2*Lambda) - 1/(2*mult) * diag(Ki)
    
    if(logN){
      dLogLdLambda <- Lambda * dLogLdLambda
    }
  }
  
  
  ## Derivative of Lambda with respect to Delta
  if("Delta" %in% components)
    # dLogL_dDelta <- crossprod(M - tcrossprod(M, matrix(rep(rSKgi / sKgi, ncol(Kgi)), ncol(Kgi))) + matrix(rep(rSKgi / sKgi, nrow(M)), nrow(M), byrow = T), dLogLdLambda) #chain rule
    dLogL_dDelta <- crossprod(M, dLogLdLambda) + rSKgi/sKgi*sum(dLogLdLambda) -  rSKgi / sKgi * sum(crossprod(M, dLogLdLambda)) #chain rule
  
  # Derivative Lambda / k_theta_g
  if("k_theta_g" %in% components){
    if(is.null(pX)){
      dCg_dk <- partial_cov_gen(X1 = X0, theta = theta, k_theta_g = k_theta_g, arg = "k_theta_g", type = covtype) * Cg
      
      if(SiNK){
        d_irho_dkthetag <- -1/2 * (fast_diag(M, t(kg)))^(-3/2) * (fast_diag(dCg_dk, Kgitkg) - fast_diag(M, dCg_dk %*% Kgitkg) + fast_diag(M, t(dCg_dk)))
        dLogL_dkthetag <- (d_irho_dkthetag * kg + rhox*(dCg_dk - M %*% dCg_dk)) %*% KgiD -
          (1 - rsM) * drop(rSKgi %*% dCg_dk %*% Kgi %*% Delta * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dk %*% rSKgi))/sKgi^2
      }else{
        dLogL_dkthetag <- dCg_dk %*% KgiD - M %*% (dCg_dk %*% KgiD) -
          (1 - rsM) * drop(rSKgi %*% dCg_dk %*% (Kgi %*% Delta) * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dk %*% rSKgi))/sKgi^2
      }
    }else{
      dCg_dk <- partial_cov_gen(X1 = pX, theta = theta, k_theta_g = k_theta_g, arg = "k_theta_g", type = covtype) * Cg
      dkg_dk <- partial_cov_gen(X1 = X0, X2 = pX, theta = theta, k_theta_g = k_theta_g, arg = "k_theta_g", type = covtype) * kg
      
      if(SiNK){
        d_irho_dkthetag <- -1/2 * (fast_diag(M, t(kg)))^(-3/2) * (fast_diag(dkg_dk, Kgitkg) - fast_diag(M, dCg_dk %*% Kgitkg) + fast_diag(M, t(dkg_dk)))
        dLogL_dkthetag <- (d_irho_dkthetag * kg + rhox*(dkg_dk - M %*% dCg_dk)) %*% KgiD -
          (1 - rsM) * drop(rSKgi %*% dCg_dk %*% Kgi %*% Delta * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dk %*% rSKgi))/sKgi^2
      }else{
        dLogL_dkthetag <- dkg_dk %*% KgiD - M %*% (dCg_dk %*% KgiD) -
          (1 - rsM) * drop(rSKgi %*% dCg_dk %*% (Kgi %*% Delta) * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dk %*% rSKgi))/sKgi^2
      }
    }
    
    dLogL_dkthetag <- crossprod(dLogL_dkthetag, dLogLdLambda) ## chain rule
  }
  
  # Derivative Lambda / theta_g
  if("theta_g" %in% components){
    dLogL_dthetag <- rep(NA, length(theta_g))
    
    for(i in 1:length(theta_g)){
      if(is.null(pX)){
        
        if(length(theta_g) == 1){
          dCg_dthetagk <- partial_cov_gen(X1 = X0, theta = theta_g, arg = "theta_k", type = covtype) * Cg # partial derivative of Cg with respect to theta
        }else{
          dCg_dthetagk <- partial_cov_gen(X1 = X0[, i, drop = FALSE], theta = theta_g[i], arg = "theta_k", type = covtype) * Cg # partial derivative of Cg with respect to theta
        }
        
        if(SiNK){
          d_irho_dtheta_gk <- -1/2 * (fast_diag(M, t(kg)))^(-3/2) * (fast_diag(dCg_dthetagk, Kgitkg) - fast_diag(M, dCg_dthetagk %*% Kgitkg) + fast_diag(M, t(dCg_dthetagk)))
          dLogL_dthetag[i] <- crossprod((d_irho_dtheta_gk * kg + rhox * (dCg_dthetagk - M %*% dCg_dthetagk)) %*% KgiD -
                                          (1 - rsM) * (rSKgi %*% dCg_dthetagk %*% Kgi %*% Delta * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dthetagk %*% rSKgi))/sKgi^2, dLogLdLambda) #chain rule
        }else{
          dLogL_dthetag[i] <- crossprod(dCg_dthetagk %*% KgiD - M %*% (dCg_dthetagk %*% KgiD) -
                                          (1 - rsM) * drop(rSKgi %*% dCg_dthetagk %*% (Kgi %*% Delta) * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dthetagk %*% rSKgi))/sKgi^2, dLogLdLambda) #chain rule
        }
      }else{
        if(length(theta_g) == 1){
          dCg_dthetagk <- partial_cov_gen(X1 = pX, theta = theta_g, arg = "theta_k", type = covtype) * Cg # partial derivative of Cg with respect to theta
          dkg_dthetagk <- partial_cov_gen(X1 = X0, X2 = pX, theta = theta_g, arg = "theta_k", type = covtype) * kg # partial derivative of Cg with respect to theta
        }else{
          dCg_dthetagk <- partial_cov_gen(X1 = pX[, i, drop = FALSE], theta = theta_g[i], arg = "theta_k", type = covtype) * Cg # partial derivative of Cg with respect to theta
          dkg_dthetagk <- partial_cov_gen(X1 = X0[, i, drop = FALSE], X2 = pX[, i, drop = FALSE], theta = theta_g[i], arg = "theta_k", type = covtype) * kg # partial derivative of Cg with respect to theta
        }
        
        if(SiNK){
          d_irho_dtheta_gk <- -1/2 * (fast_diag(M, t(kg)))^(-3/2) * (fast_diag(dkg_dthetagk, Kgitkg) - fast_diag(M, dCg_dthetagk %*% Kgitkg) + fast_diag(M, t(dkg_dthetagk)))
          dLogL_dthetag[i] <- crossprod((d_irho_dtheta_gk * kg + rhox * (dkg_dthetagk - M %*% dCg_dthetagk)) %*% KgiD -
                                          (1 - rsM) * (rSKgi %*% dCg_dthetagk %*% Kgi %*% Delta * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dthetagk %*% rSKgi))/sKgi^2, dLogLdLambda) #chain rule
        }else{
          dLogL_dthetag[i] <- crossprod(dkg_dthetagk %*% KgiD - M %*% (dCg_dthetagk %*% KgiD) -
                                          (1 - rsM) * drop(rSKgi %*% dCg_dthetagk %*% (Kgi %*% Delta) * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dthetagk %*% rSKgi))/sKgi^2, dLogLdLambda) #chain rule
        }
      }
      
      # Penalty term
      if(penalty) dLogL_dthetag[i] <- dLogL_dthetag[i] + 1/2 * crossprod(KgiD, dCg_dthetagk) %*% KgiD/nu_hat_var - trace_sym(Kgi, dCg_dthetagk)/2 
    }
  }
  
  
  ## Derivative Lambda / g
  if("g" %in% components){
    if(SiNK){
      A0 <- diag(1/mult, n)
      d_irho_dg <- -1/2 * (fast_diag(M, t(kg)))^(-3/2) * fast_diag(-M %*% A0, Kgitkg)
      dLogL_dg <- crossprod((d_irho_dg * M - rhox * M %*% A0 %*% Kgi) %*% (Delta - nmean) - (1 - rsM) * (rowSums(Kgi %*% A0 %*% Kgi) %*% Delta * sKgi - rSKgi %*% Delta * sum(rSKgi^2/mult))/sKgi^2, dLogLdLambda) #chain rule
    }else{
      dLogL_dg <- crossprod(-M %*% (KgiD/mult) - (1 - rsM) * drop(Delta %*% (Kgi %*% (rSKgi/mult)) * sKgi - rSKgi %*% Delta * sum(rSKgi^2/mult))/sKgi^2, dLogLdLambda) #chain rule
    }
  }
  
  
  # Derivative Lambda/pX
  if("pX" %in% components){
    dLogL_dpX <- rep(NA, length(pX))
    for(i in 1:nrow(pX)){
      for(j in 1:ncol(pX)){
        dCg_dpX <- partial_cov_gen(X1 = pX, theta = theta_g, i1 = i, i2 = j, arg = "X_i_j", type = covtype) * Cg
        dkg_dpX <- t(partial_cov_gen(X1 = pX, X2 = X0, theta = theta_g, i1 = i, i2 = j, arg = "X_i_j", type = covtype)) * kg
        
        if(SiNK){
          d_irho_dX_i_j <- -1/2 * (fast_diag(M, t(kg)))^(-3/2) * (fast_diag(dkg_dpX, Kgitkg) - fast_diag(M, dCg_dpX %*% Kgitkg) + fast_diag(M, t(dkg_dpX)))
          dLogL_dpX[(j-1)*nrow(pX) + i] <- crossprod((d_irho_dX_i_j * kg + rhox * (dkg_dpX - M %*% dCg_dpX)) %*% KgiD -
                                                       (1 - rsM) * (rSKgi %*% dCg_dpX %*% Kgi %*% Delta * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dpX %*% rSKgi))/sKgi^2, dLogLdLambda)
        }else{
          dLogL_dpX[(j-1)*nrow(pX) + i] <- crossprod((dkg_dpX - M %*% dCg_dpX) %*% KgiD -
                                                       (1 - rsM) * (rSKgi %*% dCg_dpX %*% Kgi %*% Delta * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dpX %*% rSKgi))/sKgi^2, dLogLdLambda)
        }
        
        # NOTE(review): the quadratic-form penalty term below is subtracted here,
        # whereas the analogous theta_g penalty adds it -- verify the sign for the
        # (experimental) pX path before relying on these derivatives.
        if(penalty) dLogL_dpX[(j-1)*nrow(pX) + i] <- dLogL_dpX[(j-1)*nrow(pX) + i] - 1/2 * crossprod(KgiD, dCg_dpX) %*% KgiD / nu_hat_var - trace_sym(Kgi, dCg_dpX)/2 
        
      }
    }
  }
  
  # Additional penalty terms on Delta
  if(penalty){
    if("Delta" %in% components){
      dLogL_dDelta <- dLogL_dDelta - KgiD / nu_hat_var
    }
    if("k_theta_g" %in% components){
      dLogL_dkthetag <- dLogL_dkthetag + 1/2 * crossprod(KgiD, dCg_dk) %*% KgiD / nu_hat_var - trace_sym(Kgi, dCg_dk)/2 
    }
    if("g" %in% components){
      dLogL_dg <- dLogL_dg + 1/2 * crossprod(KgiD/mult, KgiD) / nu_hat_var - sum(diag(Kgi)/mult)/2 
    }
    
  }
  
  # Concatenated gradient; NULL entries (unrequested components) drop out
  return(c(dLogL_dtheta,
           dLogL_dDelta,
           dLogL_dkthetag,
           dLogL_dthetag,
           dLogL_dg,
           dLogL_dpX))
  
}


#' Compare two models based on the log-likelihood for \code{hetGP} and \code{homGP} models
#' @title Likelihood-based comparison of models
#' @param model1,model2 \code{hetGP} or \code{homGP} models
#' @return Best model based on the likelihood, first one in case of a tie
#' @note If comparing homoskedastic and heteroskedastic models, the un-penalised likelihood is used for the latter, see e.g., (Binois et al. 2017+).
#' @export
#' @references
#' M. Binois, Robert B. Gramacy, M. Ludkovski (2018), Practical heteroskedastic Gaussian process modeling for large simulation experiments,
#' Journal of Computational and Graphical Statistics, 27(4), 808--821.\cr 
#' Preprint available on arXiv:1611.05902.
compareGP <- function(model1, model2){
  # Likelihood used for comparison: heteroskedastic models are judged on
  # their un-penalized likelihood, homoskedastic ones on their likelihood.
  model_ll <- function(model){
    if(is(model, "hetGP")) return(model$ll_non_pen)
    model$ll
  }
  # Return the model with the larger likelihood; ties favor the first one
  if(model_ll(model2) > model_ll(model1)) return(model2)
  model1
}


#' @title Gaussian process modeling with heteroskedastic noise
#' @description 
#' Gaussian process regression under input dependent noise based on maximum likelihood estimation of the hyperparameters. 
#' A second GP is used to model latent (log-) variances. 
#' This function is enhanced to deal with replicated observations.
#' @param X matrix of all designs, one per row, or list with elements:
#' \itemize{
#'   \item \code{X0} matrix of unique design locations, one point per row
#'   \item \code{Z0} vector of averaged observations, of length \code{nrow(X0)}
#'   \item \code{mult} number of replicates at designs in \code{X0}, of length \code{nrow(X0)}
#' } 
#' @param Z vector of all observations. If using a list with \code{X}, \code{Z} has to be ordered with respect to \code{X0}, and of length \code{sum(mult)}
#' @param lower,upper optional bounds for the \code{theta} parameter (see \code{\link[hetGP]{cov_gen}} for the exact parameterization).
#' In the multivariate case, it is possible to give vectors for bounds (resp. scalars) for anisotropy (resp. isotropy)
#' @param noiseControl list with elements related to optimization of the noise process parameters:
#' \itemize{
#' \item \code{g_min}, \code{g_max} minimal and maximal noise to signal ratio (of the mean process)
#' \item \code{lowerDelta}, \code{upperDelta} optional vectors (or scalars) of bounds on \code{Delta}, of length \code{nrow(X0)} (default to \code{rep(eps, nrow(X0))} and \code{rep(noiseControl$g_max, nrow(X0))} resp., or their \code{log}) 
## ' \item lowerpX, upperpX optional vectors of bounds of the input domain if pX is used.
#' \item \code{lowerTheta_g}, \code{upperTheta_g} optional vectors of bounds for the lengthscales of the noise process if \code{linkThetas == 'none'}.
#' Same as for \code{theta} if not provided.
#' \item \code{k_theta_g_bounds} if \code{linkThetas == 'joint'}, vector with minimal and maximal values for \code{k_theta_g} (default to \code{c(1, 100)}). See Details.
#' \item \code{g_bounds} vector for minimal and maximal noise to signal ratios for the noise of the noise process, i.e., the smoothing parameter for the noise process.
#' (default to \code{c(1e-6, 1)}).
#'}
#' @param settings list for options about the general modeling procedure, with elements:
#' \itemize{
#'   \item \code{linkThetas} defines the relation between lengthscales of the mean and noise processes.
#'   Either \code{'none'}, \code{'joint'}(default) or \code{'constr'}, see Details.
#'   \item \code{logN}, when \code{TRUE} (default), the log-noise process is modeled.
#'   \item \code{initStrategy} one of \code{'simple'}, \code{'residuals'} (default) and \code{'smoothed'} to obtain starting values for \code{Delta}, see Details
#'   \item \code{penalty} when \code{TRUE}, the penalized version of the likelihood is used (i.e., the sum of the log-likelihoods of the mean and variance processes, see References).
## '   \item \code{hardpenalty} is \code{TRUE}, the log-likelihood from the noise GP is taken into account only if negative (default if \code{maxit > 1000}).
#'   \item \code{checkHom} when \code{TRUE}, if the log-likelihood with a homoskedastic model is better, then return it.
#'   \item \code{trace} optional scalar (default to \code{0}). If positive, tracing information on the fitting process.
#' If \code{1}, information is given about the result of the heterogeneous model optimization.
#' Level \code{2} gives more details. Level \code{3} additionally displays all details about initialization of hyperparameters.
#' \item \code{return.matrices} boolean to include the inverse covariance matrix in the object for further use (e.g., prediction).
#' \item \code{return.hom} boolean to include homoskedastic GP models used for initialization (i.e., \code{modHom} and \code{modNugs}).
#' \item \code{factr} (default to 1e9) and \code{pgtol} are available to be passed to \code{control} for L-BFGS-B in \code{\link[stats]{optim}}.   
#' }
#' @param eps jitter used in the inversion of the covariance matrix for numerical stability
#' @param init,known optional lists of starting values for mle optimization or that should not be optimized over, respectively.
#' Values in \code{known} are not modified, while it can happen to these of \code{init}, see Details. 
#' One can set one or several of the following:
#' \itemize{
#' \item \code{theta} lengthscale parameter(s) for the mean process either one value (isotropic) or a vector (anisotropic)
#' \item \code{Delta} vector of nuggets corresponding to each design in \code{X0}, that are smoothed to give \code{Lambda}
#' (as the global covariance matrix depend on \code{Delta} and \code{nu_hat}, it is recommended to also pass values for \code{theta})
#' \item \code{beta0} constant trend of the mean process
#' \item \code{k_theta_g} constant used to link mean and noise process lengthscales, when \code{settings$linkThetas == 'joint'}
#' \item \code{theta_g} either one value (isotropic) or a vector (anisotropic) for lengthscale parameter(s) of the noise process, when \code{settings$linkThetas != 'joint'}
#' \item \code{g} scalar nugget of the noise process
#' \item \code{g_H} scalar homoskedastic nugget for the initialisation with a \code{\link[hetGP]{mleHomGP}}. See Details.
## '\item pX matrix of fixed pseudo inputs locations of the noise process corresponding to Delta
#' }
#' @param covtype covariance kernel type, either \code{'Gaussian'}, \code{'Matern5_2'} or \code{'Matern3_2'}, see \code{\link[hetGP]{cov_gen}}
#' @param maxit maximum number of iterations for \code{L-BFGS-B} of \code{\link[stats]{optim}} dedicated to maximum likelihood optimization
#' 
#' @details
#' 
#' The global covariance matrix of the model is parameterized as \code{nu_hat * (C + Lambda * diag(1/mult)) = nu_hat * K},
#' with \code{C} the correlation matrix between unique designs, depending on the family of kernel used (see \code{\link[hetGP]{cov_gen}} for available choices) and values of lengthscale parameters.
#' \code{nu_hat} is the plugin estimator of the variance of the process.
#' \code{Lambda} is the prediction on the noise level given by a second (homoskedastic) GP: \cr
#' \deqn{\Lambda = C_g(C_g + \mathrm{diag}(g/\mathrm{mult}))^{-1} \Delta} \cr
#' with \code{C_g} the correlation matrix between unique designs for this second GP, with lengthscales hyperparameters \code{theta_g} and nugget \code{g}
#' and \code{Delta} the variance level at \code{X0} that are estimated.
#' 
#' It is generally recommended to use \code{\link[hetGP]{find_reps}} to pre-process the data, to rescale the inputs to the unit cube and to normalize the outputs.
#' 
#' The noise process lengthscales can be set in several ways:
#' \itemize{
#' \item using \code{k_theta_g} (\code{settings$linkThetas == 'joint'}), supposed to be greater than one by default. 
#' In this case lengthscales of the noise process are multiples of those of the mean process.
#' \item if \code{settings$linkThetas == 'constr'}, then the lower bound on \code{theta_g} correspond to estimated values of an homoskedastic GP fit.
#' \item else lengthscales between the mean and noise process are independent (both either anisotropic or not).
#' }
#'
#' When no starting nor fixed parameter values are provided with \code{init} or \code{known}, 
#' the initialization process consists of fitting first an homoskedastic model of the data, called \code{modHom}.
#' Unless provided with \code{init$theta}, initial lengthscales are taken at 10\% of the range determined with \code{lower} and \code{upper},
#' while \code{init$g_H} may be used to pass an initial nugget value.
#' The resulting lengthscales provide initial values for \code{theta} (or update them if given in \code{init}). \cr \cr
#' If necessary, a second homoskedastic model, \code{modNugs}, is fitted to the empirical residual variance between the prediction
#'  given by \code{modHom} at \code{X0} and \code{Z} (up to \code{modHom$nu_hat}).
#' Note that when specifying \code{settings$linkThetas == 'joint'}, then this second homoskedastic model has fixed lengthscale parameters.
#' Starting values for \code{theta_g} and \code{g} are extracted from \code{modNugs}.\cr \cr
#' Finally, three initialization schemes for \code{Delta} are available with \code{settings$initStrategy}: 
#' \itemize{
#' \item for \code{settings$initStrategy == 'simple'}, \code{Delta} is simply initialized to the estimated \code{g} value of \code{modHom}. 
#' Note that this procedure may fail when \code{settings$penalty == TRUE}.
#' \item for \code{settings$initStrategy == 'residuals'}, \code{Delta} is initialized to the estimated residual variance from the homoskedastic mean prediction.
#' \item for \code{settings$initStrategy == 'smoothed'}, \code{Delta} takes the values predicted by \code{modNugs} at \code{X0}.
#' }
#'
#' Notice that \code{lower} and \code{upper} bounds cannot be equal for \code{\link[stats]{optim}}.
## ' To use pseudo-input locations for the noise process, one can either provide pX if they are not to be optimized.
## ' Otherwise, initial values are given with pXinit, and optimization bounds with lowerpX, upperpX in init.
## ' Automatic initialization of the other parameters without restriction is available for now only with method 'simple',
## ' otherwise it is assumed that pXinit points are a subset of X0.
#'
#' @return a list which is given the S3 class \code{"hetGP"}, with elements:
#' \itemize{
#' \item \code{theta}: unless given, maximum likelihood estimate (mle) of the lengthscale parameter(s),
#' \item \code{Delta}: unless given, mle of the nugget vector (non-smoothed),
#' \item \code{Lambda}: predicted input noise variance at \code{X0}, 
#' \item \code{nu_hat}: plugin estimator of the variance,
#' \item \code{theta_g}: unless given, mle of the lengthscale(s) of the noise/log-noise process,
#' \item \code{k_theta_g}: if \code{settings$linkThetas == 'joint'}, mle for the constant by which lengthscale parameters of \code{theta} are multiplied to get \code{theta_g},
#' \item \code{g}: unless given, mle of the nugget of the noise/log-noise process,
#' \item \code{trendtype}: either "\code{SK}" if \code{beta0} is provided, else "\code{OK}",
#' \item \code{beta0} constant trend of the mean process, plugin-estimator unless given,
#' \item \code{nmean}: plugin estimator for the constant noise/log-noise process mean,
## ' \item \code{pX}: if used, matrix of pseudo-inputs locations for the noise/log-noise process,
#' \item \code{ll}: log-likelihood value, (\code{ll_non_pen}) is the value without the penalty,
#' \item \code{nit_opt}, \code{msg}: \code{counts} and \code{message} returned by \code{\link[stats]{optim}}
#' \item \code{modHom}: homoskedastic GP model of class \code{homGP} used for initialization of the mean process,
#' \item \code{modNugs}: homoskedastic GP model of class \code{homGP} used for initialization of the noise/log-noise process,
#' \item \code{nu_hat_var}: variance of the noise process,
#' \item \code{used_args}: list with arguments provided in the call to the function, which is saved in \code{call},
#' \item \code{Ki}, \code{Kgi}: inverse of the covariance matrices of the mean and noise processes (not scaled by \code{nu_hat} and \code{nu_hat_var}),  
#' \item \code{X0}, \code{Z0}, \code{Z}, \code{eps}, \code{logN}, \code{covtype}: values given in input,
#' \item \code{time}: time to train the model, in seconds.
#'}
#' @seealso \code{\link[hetGP]{predict.hetGP}} for predictions, \code{\link[hetGP]{update.hetGP}} for updating an existing model.
#' \code{summary} and \code{plot} functions are available as well.
#' \code{\link[hetGP]{mleHetTP}} provide a Student-t equivalent.
#' @references 
#' M. Binois, Robert B. Gramacy, M. Ludkovski (2018), Practical heteroskedastic Gaussian process modeling for large simulation experiments,
#' Journal of Computational and Graphical Statistics, 27(4), 808--821.\cr 
#' Preprint available on arXiv:1611.05902. \cr \cr
#' @export
#' @importFrom stats optim var
#' @import methods
#' @examples 
#' ##------------------------------------------------------------
#' ## Example 1: Heteroskedastic GP modeling on the motorcycle data
#' ##------------------------------------------------------------
#' set.seed(32)
#' 
#' ## motorcycle data
#' library(MASS)
#' X <- matrix(mcycle$times, ncol = 1)
#' Z <- mcycle$accel
#' nvar <- 1
#' plot(X, Z, ylim = c(-160, 90), ylab = 'acceleration', xlab = "time")
#' 
#' 
#' ## Model fitting
#' model <- mleHetGP(X = X, Z = Z, lower = rep(0.1, nvar), upper = rep(50, nvar),
#'                   covtype = "Matern5_2")
#'             
#' ## Display averaged observations
#' points(model$X0, model$Z0, pch = 20)
#' 
#' ## A quick view of the fit                  
#' summary(model)
#' 
#' ## Create a prediction grid and obtain predictions
#' xgrid <- matrix(seq(0, 60, length.out = 301), ncol = 1) 
#' predictions <- predict(x = xgrid, object =  model)
#' 
#' ## Display mean predictive surface
#' lines(xgrid, predictions$mean, col = 'red', lwd = 2)
#' ## Display 95% confidence intervals
#' lines(xgrid, qnorm(0.05, predictions$mean, sqrt(predictions$sd2)), col = 2, lty = 2)
#' lines(xgrid, qnorm(0.95, predictions$mean, sqrt(predictions$sd2)), col = 2, lty = 2)
#' ## Display 95% prediction intervals
#' lines(xgrid, qnorm(0.05, predictions$mean, sqrt(predictions$sd2 + predictions$nugs)), 
#'   col = 3, lty = 2)
#' lines(xgrid, qnorm(0.95, predictions$mean, sqrt(predictions$sd2 + predictions$nugs)), 
#'   col = 3, lty = 2)
#' 
#' ##------------------------------------------------------------
#' ## Example 2: 2D Heteroskedastic GP modeling
#' ##------------------------------------------------------------
#' set.seed(1)
#' nvar <- 2
#'   
#' ## Branin redefined in [0,1]^2
#' branin <- function(x){
#'   if(is.null(nrow(x)))
#'     x <- matrix(x, nrow = 1)
#'     x1 <- x[,1] * 15 - 5
#'     x2 <- x[,2] * 15
#'     (x2 - 5/(4 * pi^2) * (x1^2) + 5/pi * x1 - 6)^2 + 10 * (1 - 1/(8 * pi)) * cos(x1) + 10
#' }
#' 
#' ## Noise field via standard deviation
#' noiseFun <- function(x){
#'   if(is.null(nrow(x)))
#'     x <- matrix(x, nrow = 1)
#'   return(1/5*(3*(2 + 2*sin(x[,1]*pi)*cos(x[,2]*3*pi) + 5*rowSums(x^2))))
#' }
#' 
#' ## data generating function combining mean and noise fields
#' ftest <- function(x){
#'   return(branin(x) + rnorm(nrow(x), mean = 0, sd = noiseFun(x)))
#' }
#' 
#' ## Grid of predictive locations
#' ngrid <- 51
#' xgrid <- matrix(seq(0, 1, length.out = ngrid), ncol = 1) 
#' Xgrid <- as.matrix(expand.grid(xgrid, xgrid))
#' 
#' ## Unique (randomly chosen) design locations
#' n <- 50
#' Xu <- matrix(runif(n * 2), n)
#' 
#' ## Select replication sites randomly
#' X <- Xu[sample(1:n, 20*n, replace = TRUE),]
#' 
#' ## obtain training data response at design locations X
#' Z <- ftest(X)
#' 
#' ## Formating of data for model creation (find replicated observations) 
#' prdata <- find_reps(X, Z, rescale = FALSE, normalize = FALSE)
#'
#' ## Model fitting
#' model <- mleHetGP(X = list(X0 = prdata$X0, Z0 = prdata$Z0, mult = prdata$mult), Z = prdata$Z,
#'                   lower = rep(0.01, nvar), upper = rep(10, nvar),
#'                   covtype = "Matern5_2")
#'
#' ## a quick view into the data stored in the "hetGP"-class object
#' summary(model)                  
#'              
#' ## prediction from the fit on the grid     
#' predictions <- predict(x = Xgrid, object =  model)
#' 
#' ## Visualization of the predictive surface
#' par(mfrow = c(2, 2))
#' contour(x = xgrid,  y = xgrid, z = matrix(branin(Xgrid), ngrid), 
#'   main = "Branin function", nlevels = 20)
#' points(X, col = 'blue', pch = 20)
#' contour(x = xgrid,  y = xgrid, z = matrix(predictions$mean, ngrid), 
#'   main = "Predicted mean", nlevels = 20)
#' points(Xu, col = 'blue', pch = 20)
#' contour(x = xgrid,  y = xgrid, z = matrix(noiseFun(Xgrid), ngrid), 
#'   main = "Noise standard deviation function", nlevels = 20)
#' points(Xu, col = 'blue', pch = 20)
#' contour(x = xgrid,  y= xgrid, z = matrix(sqrt(predictions$nugs), ngrid), 
#'   main = "Predicted noise values", nlevels = 20)
#' points(Xu, col = 'blue', pch = 20)
#' par(mfrow = c(1, 1))
##
mleHetGP <- function(X, Z, lower = NULL, upper = NULL,
                     noiseControl = list(k_theta_g_bounds = c(1, 100), g_max = 1e2, g_bounds = c(1e-6, 1)),
                     settings = list(linkThetas = 'joint', logN = TRUE, initStrategy = 'residuals', checkHom = TRUE,
                                     penalty = TRUE, trace = 0, return.matrices = TRUE, return.hom = FALSE, factr = 1e9), 
                     covtype = c("Gaussian", "Matern5_2", "Matern3_2"), maxit = 100, known = NULL, init = NULL, eps = sqrt(.Machine$double.eps)){
  
  ## Accept either a precomputed list (X0, Z0, mult), e.g., from find_reps,
  ## or a raw design matrix X whose replicates are detected here.
  if(typeof(X)=="list"){
    X0 <- X$X0
    Z0 <- X$Z0
    mult <- X$mult
    if(sum(mult) != length(Z)) stop("Length(Z) should be equal to sum(mult)")
    if(is.null(dim(X0))) X0 <- matrix(X0, ncol = 1)
    if(length(Z0) != nrow(X0)) stop("Dimension mismatch between Z0 and X0")
  }else{
    if(is.null(dim(X))) X <- matrix(X, ncol = 1)
    if(nrow(X) != length(Z)) stop("Dimension mismatch between Z and X")
    elem <- find_reps(X, Z, return.Zlist = F)
    X0 <- elem$X0
    Z0 <- elem$Z0
    Z <- elem$Z
    mult <- elem$mult
  }
  
  covtype <- match.arg(covtype)
  
  ## Derive default lengthscale bounds from the design when not provided
  if(is.null(lower) || is.null(upper)){
    auto_thetas <- auto_bounds(X = X0, covtype = covtype)
    if(is.null(lower)) lower <- auto_thetas$lower
    if(is.null(upper)) upper <- auto_thetas$upper
  }
  
  if(length(lower) != length(upper)) stop("upper and lower should have the same size")
  
  ## Save time to train model
  tic <- proc.time()[3]
  
  ## Initial checks
  
  n <- nrow(X0)
  
  if(is.null(n))
    stop("X0 should be a matrix. \n")
  
  ## linkThetas options: 'joint' ties theta_g = k_theta_g * theta,
  ## 'constr' lower-bounds theta_g by the homoskedastic theta estimate
  jointThetas <- constrThetas <- FALSE
  if(!is.null(known$theta_g)) settings$linkThetas <- FALSE
  
  if(is.null(settings$linkThetas)){
    jointThetas <- TRUE
  }else{
    if(settings$linkThetas == 'joint')
      jointThetas <- TRUE
    if(settings$linkThetas == 'constr')
      constrThetas <- TRUE
  }
  
  logN <- TRUE
  if(!is.null(settings$logN)){
    logN <- settings$logN
  }
  
  if(is.null(settings$return.matrices))
    settings$return.matrices <- TRUE
  
  if(is.null(settings$return.hom))
    settings$return.hom <- FALSE
  
  if(jointThetas && is.null(noiseControl$k_theta_g_bounds))
    noiseControl$k_theta_g_bounds <- c(1, 100)
  
  if(is.null(settings$initStrategy))
    settings$initStrategy <- 'residuals'
  
  if(is.null(settings$factr))
    settings$factr <- 1e9
  
  penalty <- TRUE
  if(!is.null(settings$penalty))
    penalty <- settings$penalty
  
  # if(!is.null(settings$hardpenalty)){
  #   hardpenalty <- settings$hardpenalty
  # }else{
  #   if(maxit < 1000) hardpenalty <- FALSE else hardpenalty <- TRUE
  # }
  
  
  if(is.null(settings$checkHom))
    settings$checkHom <- TRUE
  
  trace <- 0
  if(!is.null(settings$trace))
    trace <- settings$trace
  
  ## Build the list of hyperparameters to be optimized; known values are fixed
  ## ([["theta"]] is used to avoid partial matching with theta_g)
  components <- NULL
  if(is.null(known[["theta"]])){
    components <- c(components, list("theta"))
  }else{
    init$theta <- known[["theta"]]
  }
  
  if(is.null(known$Delta)){
    components <- c(components, list("Delta"))
  }else{
    init$Delta <- known$Delta
  }
  
  if(jointThetas){
    if(is.null(known$k_theta_g)){
      components <- c(components, list("k_theta_g"))
    }else{
      init$k_theta_g <- known$k_theta_g
    }
  }
  
  if(!jointThetas && is.null(known$theta_g)){
    components <- c(components, list("theta_g"))
  }else{
    if(!jointThetas) init$theta_g <- known$theta_g
  }
  
  if(is.null(known$g)){
    components <- c(components, list("g"))
  }else{
    init$g <- known$g
  }
  
  if(!is.null(init$pX)){
    components <- c(components, list("pX"))
    idcs_pX <- which(duplicated(rbind(init$pX, X0))) - nrow(init$pX) ## Indices of starting points in pX
  }else{
    if(!is.null(known$pX)){
      init$pX <- known$pX
    }else{
      init$pX <- NULL
    }
  }
  
  if(penalty && "pX" %in% components){
    penalty <- FALSE
    warning("Penalty not available with pseudo-inputs for now")
  }
  
  ## Ordinary kriging trend unless beta0 is user-supplied (simple kriging)
  trendtype <- 'OK'
  
  if(!is.null(known$beta0)){
    trendtype <- 'SK'
    # beta0 <- known$beta0
  }
  
  if(is.null(noiseControl$g_bounds)){
    noiseControl$g_bounds <- c(1e-6, 1)
  }
  
  if(is.null(components) && is.null(known$theta_g)){
    known$theta_g <- known$k_theta_g * known$theta
  }
  
  ## Advanced option Single Nugget Kriging model for the noise process
  if(!is.null(noiseControl$SiNK) && noiseControl$SiNK){
    SiNK <- TRUE
    if(is.null(noiseControl$SiNK_eps)){
      SiNK_eps <- 1e-4
    }else{
      SiNK_eps <- noiseControl$SiNK_eps
    }
  }else{
    SiNK <- FALSE
  } 
  
  
  ### Automatic Initialisation
  modHom <- modNugs <- NULL
  if(is.null(init[["theta"]]) || is.null(init$Delta)){
    ## A) homoskedastic mean process
    
    if(!is.null(known$g_H)){
      g_init <- NULL
    }else{
      g_init <- init$g_H
      ## Initial value for g of the homoskedastic process: based on the mean variance at replicates compared to the variance of Z0
      if(any(mult > 5)){
        mean_var_replicates <- mean((fast_tUY2(mult, (Z - rep(Z0, times = mult))^2)/mult)[which(mult > 5)])
        
        if(is.null(g_init)) g_init <- mean_var_replicates/var(Z0) 
        if(is.null(noiseControl$g_max))
          noiseControl$g_max <- max(1e2, 100 * g_init)
        if(is.null(noiseControl$g_min))
          noiseControl$g_min <- eps
        
      }else{
        if(is.null(g_init)) g_init <- 0.1
        
        if(is.null(noiseControl$g_max))
          noiseControl$g_max <- 1e2
        
        if(is.null(noiseControl$g_min))
          noiseControl$g_min <- eps
        
      }
      
    }
    
    # if(is.null(init[["theta"]]))
    #   init$theta <- sqrt(upper * lower)
    
    ## Ki of the homoskedastic model is only needed if it may be returned (checkHom)
    if(settings$checkHom){
      rKI <- TRUE #return.Ki
    }else{
      rKI <- FALSE
    } 
    modHom <- mleHomGP(X = list(X0 = X0, Z0 = Z0, mult = mult), Z = Z, lower = lower,
                       known = list(theta = known[["theta"]], g = known$g_H, beta0 = known$beta0),
                       upper = upper, init = list(theta = init$theta, g = g_init), covtype = covtype, maxit = maxit,
                       noiseControl = list(g_bounds = c(noiseControl$g_min, noiseControl$g_max)), eps = eps,
                       settings = list(return.Ki = rKI))
    
    if(is.null(known[["theta"]]))
      init$theta <- modHom$theta
    
    if(is.null(init$Delta)){
      predHom <- suppressWarnings(predict(x = X0, object = modHom)$mean)
      nugs_est <- (Z - rep(predHom, times = mult))^2 #squared deviation from the homoskedastic prediction mean to the actual observations
      nugs_est <-  nugs_est / modHom$nu_hat  # to be homogeneous with Delta
      
      if(logN){
        nugs_est <- pmax(nugs_est, .Machine$double.eps) # to avoid problems on deterministic test functions
        nugs_est <- log(nugs_est)
      }
      
      nugs_est0 <- drop(fast_tUY2(mult, nugs_est))/mult # average
      
    }else{
      nugs_est0 <- init$Delta
    }
    
    if(constrThetas){
      noiseControl$lowerTheta_g <- modHom$theta
    }
    
    if(settings$initStrategy == 'simple'){
      if(logN){
        init$Delta <- rep(log(modHom$g), nrow(X0))
      }else{
        init$Delta <- rep(modHom$g, nrow(X0))
      }
    }
    if(settings$initStrategy == 'residuals')
      init$Delta <- nugs_est0
    
  }
  
  
  if((is.null(init$theta_g) && is.null(init$k_theta_g)) || is.null(init$g)){
    
    ## B) Homogeneous noise process
    if(jointThetas){
      if(is.null(init$k_theta_g)){
        init$k_theta_g <- 1
        init$theta_g <- init$theta
      }else{
        init$theta_g <- init$k_theta_g * init$theta
      }
      
      ## With joint thetas, pin theta_g (+/- eps) during the auxiliary noise fit
      if(is.null(noiseControl$lowerTheta_g)){
        noiseControl$lowerTheta_g <- init$theta_g - eps
      }
      
      if(is.null(noiseControl$upperTheta_g)){
        noiseControl$upperTheta_g <- init$theta_g + eps
      }
    }
    
    if(!jointThetas && is.null(init$theta_g)){
      init$theta_g <- init$theta
    }
    
    if(is.null(noiseControl$lowerTheta_g)){
      noiseControl$lowerTheta_g <- lower
    }
    
    if(is.null(noiseControl$upperTheta_g)){
      noiseControl$upperTheta_g <- upper
    }
    
    ## If a homogeneous process of the mean has already been computed, it is used for estimating the parameters of the noise process
    if(exists("nugs_est")){
      
      if(is.null(init$g)){
        mean_var_replicates_nugs <- mean((fast_tUY2(mult, (nugs_est - rep(nugs_est0, times = mult))^2)/mult))
        init$g <- mean_var_replicates_nugs / var(nugs_est0)
      }
      
      modNugs <- mleHomGP(X = list(X0 = X0, Z0 = nugs_est0, mult = mult), Z = nugs_est,
                          lower = noiseControl$lowerTheta_g, upper = noiseControl$upperTheta_g,
                          init = list(theta = init$theta_g, g =  init$g), covtype = covtype, noiseControl = noiseControl,
                          maxit = maxit, eps = eps, settings = list(return.Ki = F))
      prednugs <- suppressWarnings(predict(x = X0, object = modNugs))
      
    }else{
      if(!exists("nugs_est0")) nugs_est0 <- init$Delta
      
      if(is.null(init$g)){
        init$g <- 0.05
      }
      
      modNugs <- mleHomGP(X = list(X0 = X0, Z0 = nugs_est0, mult = rep(1, nrow(X0))), Z = nugs_est0,
                          lower = noiseControl$lowerTheta_g, upper = noiseControl$upperTheta_g,
                          init = list(theta = init$theta_g, g =  init$g), covtype = covtype, noiseControl = noiseControl,
                          maxit = maxit, eps = eps, settings = list(return.Ki = F))
      prednugs <- suppressWarnings(predict(x = X0, object = modNugs))
    }
    
    if(settings$initStrategy == 'smoothed')
      init$Delta <- prednugs$mean
    
    if(is.null(known$g))
      init$g <- modNugs$g
    
    if(jointThetas && is.null(init$k_theta_g))
      init$k_theta_g <- 1
    
    if(!jointThetas && is.null(init$theta_g))
      init$theta_g <- modNugs$theta
    
  }
  
  if(is.null(noiseControl$lowerTheta_g)){
    noiseControl$lowerTheta_g <- lower
  }
  
  if(is.null(noiseControl$upperTheta_g)){
    noiseControl$upperTheta_g <- upper
  }
  
  ### Start of optimization of the log-likelihood
  ## Objective: unpack the flat parameter vector par (order: theta, Delta,
  ## k_theta_g or theta_g, g, pX) and evaluate the heteroskedastic likelihood
  fn <- function(par, X0, Z0, Z, mult, Delta = NULL, theta = NULL, g = NULL, k_theta_g = NULL, theta_g = NULL, logN = FALSE, SiNK = FALSE,
                 beta0 = NULL, pX = NULL, hom_ll, env){
    
    idx <- 1 # to store the first non used element of par
    
    if(is.null(theta)){
      idx <- length(init$theta)
      theta <- par[1:idx]
      idx <- idx + 1
    }
    if(is.null(Delta)){
      Delta <- par[idx:(idx - 1 + length(init$Delta))]
      idx <- idx + length(init$Delta)
    }
    if(jointThetas && is.null(k_theta_g)){
      k_theta_g <- par[idx]
      idx <- idx + 1
    }
    if(!jointThetas && is.null(theta_g)){
      theta_g <- par[idx:(idx - 1 + length(init$theta_g))]
      idx <- idx + length(init$theta_g)
    }
    if(is.null(g)){
      g <- par[idx]
      idx <- idx + 1
    }
    if(idx != (length(par) + 1))
      pX <- matrix(par[idx:length(par)], ncol = ncol(X0))
    
    loglik <- logLikHet(X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = Delta, theta = theta, g = g, k_theta_g = k_theta_g, theta_g = theta_g,
                        logN = logN, SiNK = SiNK, beta0 = beta0, pX = pX, covtype = covtype, eps = eps, SiNK_eps = SiNK_eps, penalty = penalty,
                        hom_ll = hom_ll, env = env, trace = trace)
    
    ## Track the best parameters seen so far, to recover from optim failures
    if(!is.null(env) && !is.na(loglik)){
      if(is.null(env$max_loglik) || loglik > env$max_loglik){
        env$max_loglik <- loglik
        env$arg_max <- par
      }
    } 
    
    return(loglik)
  }
  
  
  ## Gradient counterpart of fn, with the same parameter unpacking
  gr <- function(par, X0, Z0, Z, mult, Delta = NULL, theta = NULL, g = NULL, k_theta_g = NULL, theta_g = NULL, logN = FALSE, SiNK = FALSE,
                 beta0 = NULL, pX = NULL, hom_ll, env){
    
    idx <- 1 # to store the first non used element of par
    
    if(is.null(theta)){
      theta <- par[1:length(init$theta)]
      idx <- idx + length(init$theta)
    }
    if(is.null(Delta)){
      Delta <- par[idx:(idx - 1 + length(init$Delta))]
      idx <- idx + length(init$Delta)
    }
    if(jointThetas && is.null(k_theta_g)){
      k_theta_g <- par[idx]
      idx <- idx + 1
    }
    if(!jointThetas && is.null(theta_g)){
      theta_g <- par[idx:(idx - 1 + length(init$theta_g))]
      idx <- idx + length(init$theta_g)
    }
    if(is.null(g)){
      g <- par[idx]
      idx <- idx + 1
    }
    if(idx != (length(par) + 1))
      pX <- matrix(par[idx:length(par)], ncol = ncol(X0))
    
    # c(grad(logLikHet, X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = Delta, x = theta, g = g, k_theta_g = k_theta_g, theta_g = theta_g,
    #        logN = logN, SiNK = SiNK, beta0 = beta0, pX = pX),
    #   grad(logLikHet, X0 = X0, Z0 = Z0, Z = Z, mult = mult, x = Delta, theta = theta, g = g, k_theta_g = k_theta_g, theta_g = theta_g,
    #        logN = logN, SiNK = SiNK, beta0 = beta0, pX = pX),
    #   grad(logLikHet, X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = Delta, theta = theta, g = g, x = k_theta_g, theta_g = theta_g,
    #        logN = logN, SiNK = SiNK, beta0 = beta0, pX = pX),
    #   grad(logLikHet, X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = Delta, theta = theta, x = g, k_theta_g = k_theta_g, theta_g = theta_g,
    #        logN = logN, SiNK = SiNK, beta0 = beta0, pX = pX)
    # )
    # grad(fn, x = par, X0 = X0, Z0 = Z0, Z = Z, mult = mult, logN = logN, SiNK = SiNK, method.args=list(r = 6))
    
    return(dlogLikHet(X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = Delta, theta = theta, g = g, k_theta_g = k_theta_g, theta_g = theta_g,
                      logN = logN, SiNK = SiNK, beta0 = beta0, pX = pX, components = components, covtype = covtype, eps = eps, SiNK_eps = SiNK_eps,
                      penalty = penalty, hom_ll = hom_ll, env = env))
    
  }
  
  ## Pre-processing for heterogeneous fit: assemble starting point and box
  ## bounds following the same ordering as in fn/gr
  parinit <- lowerOpt <- upperOpt <- NULL
  
  if(trace > 2)
    cat("Initial value of the parameters:\n")
  
  if(is.null(known[["theta"]])){
    parinit <- init$theta
    lowerOpt <- lower
    upperOpt <- upper
    if(trace > 2) cat("Theta: ", init$theta, "\n")
  }
  
  if(is.null(known$Delta)){
    
    if(is.null(noiseControl$lowerDelta) || length(noiseControl$lowerDelta) != n){
      if(logN){
        noiseControl$lowerDelta <- log(eps)
      }else{
        noiseControl$lowerDelta <- eps
      }
    }
    
    if(length(noiseControl$lowerDelta) == 1){
      noiseControl$lowerDelta <- rep(noiseControl$lowerDelta, n)
    }
    
    if(is.null(noiseControl$g_max)) noiseControl$g_max <- 1e2 
    
    if(is.null(noiseControl$upperDelta) || length(noiseControl$upperDelta) != n){
      if(logN){
        noiseControl$upperDelta <- log(noiseControl$g_max)
      }else{
        noiseControl$upperDelta <- noiseControl$g_max
      }
    }
    
    if(length(noiseControl$upperDelta) == 1){
      noiseControl$upperDelta <- rep(noiseControl$upperDelta, n)
    }
    
    ## For now, only the values at pX are kept
    ## It could be possible to fit a pseudo-input GP to all of Delta
    if("pX" %in% components){
      init$Delta <- init$Delta[idcs_pX]
      noiseControl$lowerDelta <- noiseControl$lowerDelta[idcs_pX]
      noiseControl$upperDelta <- noiseControl$upperDelta[idcs_pX]
    }
    
    lowerOpt <- c(lowerOpt, noiseControl$lowerDelta)
    upperOpt <- c(upperOpt, noiseControl$upperDelta)
    parinit <- c(parinit, init$Delta)
    
    if(trace > 2) cat("Delta: ", init$Delta, "\n")
  }
  
  if(jointThetas && is.null(known$k_theta_g)){
    parinit <- c(parinit, init$k_theta_g)
    lowerOpt <- c(lowerOpt, noiseControl$k_theta_g_bounds[1])
    upperOpt <- c(upperOpt, noiseControl$k_theta_g_bounds[2])
    if(trace > 2) cat("k_theta_g: ", init$k_theta_g, "\n")
  }
  
  if(!jointThetas && is.null(known$theta_g)){
    parinit <- c(parinit, init$theta_g)
    lowerOpt <- c(lowerOpt, noiseControl$lowerTheta_g)
    upperOpt <- c(upperOpt, noiseControl$upperTheta_g)
    if(trace > 2) cat("theta_g: ", init$theta_g, "\n")
  }
  
  if(is.null(known$g)){
    parinit <- c(parinit, init$g)
    if(trace > 2) cat("g: ", init$g, "\n")
    
    lowerOpt <- c(lowerOpt, noiseControl$g_bounds[1])
    upperOpt <- c(upperOpt, noiseControl$g_bounds[2])
  }
  
  if("pX" %in% components){
    parinit <- c(parinit, as.vector(init$pX))
    lowerOpt <- c(lowerOpt, as.vector(rep(noiseControl$lowerpX, times = nrow(init$pX))))
    upperOpt <- c(upperOpt, as.vector(rep(noiseControl$upperpX, times = nrow(init$pX))))
    if(trace > 2) cat("pX: ", as.vector(init$pX), "\n")
  }
  
  
  ## Case when some parameters need to be estimated
  mle_par <- known # Store inferred and known parameters
  if(!is.null(components)){
    
    if(!is.null(modHom)){
      hom_ll <- modHom$ll
    }else{
      ## Compute reference homoskedastic likelihood, with fixed theta for speed
      ## (note: no stray positional argument here; all arguments are named to
      ## avoid silently filling an unintended formal of mleHomGP)
      modHom_tmp <- mleHomGP(X = list(X0 = X0, Z0 = Z0, mult = mult), Z = Z, lower = lower, upper = upper,
                             known = list(theta = known[["theta"]], g = known$g_H, beta0 = known$beta0), covtype = covtype, init = init,
                             noiseControl = list(g_bounds = c(noiseControl$g_min, noiseControl$g_max)), eps = eps,
                             settings = list(return.Ki = F))
      
      hom_ll <- modHom_tmp$ll
    } 
    
    ## Maximization of the log-likelihood
    envtmp <- environment()
    out <- try(optim(par = parinit, fn = fn, gr = gr, method = "L-BFGS-B", lower = lowerOpt, upper = upperOpt, X0 = X0, Z0 = Z0, Z = Z,
                     mult = mult, logN = logN, SiNK = SiNK,
                     Delta = known$Delta, theta = known[["theta"]], g = known$g, k_theta_g = known$k_theta_g, theta_g = known$theta_g,
                     pX = known$pX, beta0 = known$beta0, hom_ll = hom_ll, env = envtmp,
                     control = list(fnscale = -1, maxit = maxit, factr = settings$factr, pgtol = settings$pgtol)))
    
    ## Catch errors when at least one likelihood evaluation worked
    if(is(out, "try-error"))
      out <- list(par = envtmp$arg_max, value = envtmp$max_loglik, counts = NA,
                  message = "Optimization stopped due to NAs, use best value so far")
    
    ## Temporary
    if(trace > 0){
      print(out$message)
      cat("Number of variables at boundary values:" , length(which(out$par == upperOpt)) + length(which(out$par == lowerOpt)), "\n")
    }
    
    if(trace > 1)
      cat("Name | Value | Lower bound | Upper bound \n")
    
    ## Post-processing: unpack out$par in the same order as fn/gr
    idx <- 1
    if(is.null(known[["theta"]])){
      mle_par$theta <- out$par[1:length(init$theta)]
      idx <- idx + length(init$theta)
      if(trace > 1) cat("Theta |", mle_par$theta, " | ", lower, " | ", upper, "\n")
    }
    if(is.null(known$Delta)){
      mle_par$Delta <- out$par[idx:(idx - 1 + length(init$Delta))]
      idx <- idx + length(init$Delta)
      if(trace > 1){
        for(ii in 1:ceiling(length(mle_par$Delta)/5)){
          i_tmp <- (1 + 5*(ii-1)):min(5*ii, length(mle_par$Delta))
          if(logN) cat("Delta |", mle_par$Delta[i_tmp], " | ", pmax(log(eps * mult[i_tmp]), init$Delta[i_tmp] - log(1000)), " | ", init$Delta[i_tmp] + log(100), "\n")
          if(!logN) cat("Delta |", mle_par$Delta[i_tmp], " | ", pmax(mult[i_tmp] * eps, init$Delta[i_tmp] / 1000), " | ", init$Delta[i_tmp] * 100, "\n")
        }
      }
    }
    if(jointThetas){
      if(is.null(known$k_theta_g)){
        mle_par$k_theta_g <- out$par[idx]
        idx <- idx + 1
      } 
      mle_par$theta_g <- mle_par$k_theta_g * mle_par$theta
      
      if(trace > 1) cat("k_theta_g |", mle_par$k_theta_g, " | ", noiseControl$k_theta_g_bounds[1], " | ", noiseControl$k_theta_g_bounds[2], "\n")
    }
    if(!jointThetas && is.null(known$theta_g)){
      mle_par$theta_g <- out$par[idx:(idx - 1 + length(init$theta_g))]
      idx <- idx + length(init$theta_g)
      if(trace > 1) cat("theta_g |", mle_par$theta_g, " | ", noiseControl$lowerTheta_g, " | ", noiseControl$upperTheta_g, "\n")
    }
    if(is.null(known$g)){
      mle_par$g <- out$par[idx]
      idx <- idx + 1
      if(trace > 1) cat("g |", mle_par$g, " | ", noiseControl$g_bounds[1], " | ", noiseControl$g_bounds[2], "\n")
    }
    if(idx != (length(out$par) + 1)){
      mle_par$pX <- matrix(out$par[idx:length(out$par)], ncol = ncol(X0))
      if(trace > 1) cat("pX |", as.vector(mle_par$pX), " | ", as.vector(rep(noiseControl$lowerpX, times = nrow(init$pX))), " | ", as.vector(rep(noiseControl$upperpX, times = nrow(init$pX))), "\n")
    }
    
  }else{
    out <- list(message = "All hyperparameters given, no optimization \n", count = 0, value = NULL)
  }
  
  ## Computation of nu2
  
  ## Unpenalized likelihood, for fair comparison against the homoskedastic fit
  if(penalty){
    ll_non_pen <- logLikHet(X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = mle_par$Delta, theta = mle_par$theta, g = mle_par$g, k_theta_g = mle_par$k_theta_g, theta_g = mle_par$theta_g,
                            logN = logN, SiNK = SiNK, beta0 = mle_par$beta0, pX = mle_par$pX, covtype = covtype, eps = eps, SiNK_eps = SiNK_eps, penalty = FALSE, hom_ll = NULL, trace = trace)
  }else{
    ll_non_pen <- out$value
  }
  if(!is.null(modHom)){
    if(modHom$ll >= ll_non_pen){
      if(trace >= 0) cat("Homoskedastic model has higher log-likelihood: \n", modHom$ll, " compared to ", ll_non_pen, "\n")
      if(settings$checkHom){
        if(trace >= 0) cat("Return homoskedastic model \n")
        return(modHom)
      }
    }
  }
  
  ## Smooth the latent Delta values through the noise GP to obtain Lambda
  if(is.null(mle_par$pX)){
    Cg <- cov_gen(X1 = X0, theta = mle_par$theta_g, type = covtype)
    Kgi <- chol2inv(chol(Cg + diag(eps + mle_par$g/mult)))
    
    nmean <- drop(rowSums(Kgi) %*% mle_par$Delta / sum(Kgi)) ## ordinary kriging mean
    
    nu_hat_var <- max(eps, drop(crossprod(mle_par$Delta - nmean, Kgi) %*% (mle_par$Delta - nmean))/length(mle_par$Delta))
    
    if(SiNK){
      rhox <- 1 / rho_AN(xx = X0, X0 = mle_par$pX, theta_g = mle_par$theta_g, g = mle_par$g, type = covtype, eps = eps, SiNK_eps = SiNK_eps, mult = mult)
      M <-  rhox * Cg %*% (Kgi %*% (mle_par$Delta - nmean))
    }else{
      M <- Cg %*% (Kgi %*% (mle_par$Delta - nmean))
    }
  }else{
    Kgi <- chol2inv(chol(add_diag(cov_gen(X1 = mle_par$pX, theta = mle_par$theta_g, type = covtype), eps + mle_par$g/mult)))
    
    kg <- cov_gen(X1 = X0, X2 = mle_par$pX, theta = mle_par$theta_g, type = covtype)
    
    nmean <- drop(rowSums(Kgi) %*% mle_par$Delta / sum(Kgi)) ## ordinary kriging mean
    
    nu_hat_var <- max(eps, drop(crossprod(mle_par$Delta - nmean, Kgi) %*% (mle_par$Delta - nmean))/length(mle_par$Delta))
    
    if(SiNK){
      rhox <- 1 / rho_AN(xx = X0, X0 = mle_par$pX, theta_g = mle_par$theta_g, g = mle_par$g, type = covtype, eps = eps, SiNK_eps = SiNK_eps, mult = mult)
      M <-  rhox * kg %*% (Kgi %*% (mle_par$Delta - nmean))
    }else{
      M <- kg %*% (Kgi %*% (mle_par$Delta - nmean))
    }
  }
  
  
  Lambda <- drop(nmean + M)
  
  if(logN){
    Lambda <- exp(Lambda)
  }
  else{
    Lambda[Lambda <= 0] <- eps
  }
  
  LambdaN <- rep(Lambda, times = mult)
  
  Ki <- chol2inv(chol(add_diag(cov_gen(X1 = X0, theta = mle_par$theta, type = covtype), Lambda/mult + eps))) 
  
  if(is.null(known$beta0))
    mle_par$beta0 <- drop(colSums(Ki) %*% Z0 / sum(Ki))
  
  psi_0 <- drop(crossprod(Z0 - mle_par$beta0, Ki) %*% (Z0 - mle_par$beta0))
  
  ## Plug-in scale estimate nu2, using the replicate-aware decomposition
  nu2 <- 1/length(Z) * (crossprod((Z - mle_par$beta0)/LambdaN, Z - mle_par$beta0) - crossprod((Z0 - mle_par$beta0) * mult/Lambda, Z0 - mle_par$beta0) + psi_0)
  
  res <- list(theta = mle_par$theta, Delta = mle_par$Delta, nu_hat = as.numeric(nu2), beta0 = mle_par$beta0,
              k_theta_g = mle_par$k_theta_g, theta_g = mle_par$theta_g, g = mle_par$g, nmean = nmean, Lambda = Lambda,
              ll = out$value, ll_non_pen = ll_non_pen, nit_opt = out$counts, logN = logN, SiNK = SiNK, covtype = covtype, pX = mle_par$pX, msg = out$message,
              X0 = X0, Z0 = Z0, Z = Z, mult = mult, trendtype = trendtype, eps = eps,
              nu_hat_var = nu_hat_var, call = match.call(),
              used_args = list(noiseControl = noiseControl, settings = settings, lower = lower, upper = upper, known = known),
              time = proc.time()[3] - tic)
  if(SiNK){
    res <- c(res, list(SiNK_eps = SiNK_eps))
  }
  
  if(settings$return.matrices){
    res <- c(res, list(Ki = Ki, Kgi = Kgi))
  }
  
  if(settings$return.hom){
    res <- c(res, list(modHom = modHom, modNugs = modNugs))
  }
  
  class(res) <- "hetGP"
  
  return(res)
  
}

## Register an S4 generic for predict() only if none exists yet (methods
## package), so that predict() can also dispatch on S4 objects without
## clobbering a generic defined by another loaded package.
if(!isGeneric("predict")) {
  setGeneric(name = "predict",
             def = function(object, ...) standardGeneric("predict")
  )
}

#'Gaussian process predictions using a heterogeneous noise GP object (of class \code{hetGP}) 
#' @param x matrix of designs locations to predict at (one point per row)
#' @param object an object of class \code{hetGP}; e.g., as returned by \code{\link[hetGP]{mleHetGP}}
#' @param noise.var should the variance of the latent variance process be returned?
#' @param xprime optional second matrix of predictive locations to obtain the predictive covariance matrix between \code{x} and \code{xprime}
#' @param nugs.only if \code{TRUE}, only return noise variance prediction
#' @param ... no other argument for this method.
#' @return list with elements
#' \itemize{
#' \item \code{mean}: kriging mean;
#' \item \code{sd2}: kriging variance (filtered, e.g. without the nugget values)
#' \item \code{nugs}: noise variance prediction
#' \item \code{sd2var}: (returned if \code{noise.var = TRUE}) kriging variance of the noise process (i.e., on log-variances if \code{logN = TRUE})
#' \item \code{cov}: (returned if \code{xprime} is given) predictive covariance matrix between \code{x} and \code{xprime}
#' }
#' @details The full predictive variance corresponds to the sum of \code{sd2} and \code{nugs}.
#' See \code{\link[hetGP]{mleHetGP}} for examples.
#' @method predict hetGP 
#' @export
predict.hetGP <- function(object, x, noise.var = FALSE, xprime = NULL, nugs.only = FALSE, ...){
  
  ## Coerce a bare vector to a one-row matrix; the ncol check then catches
  ## inputs whose length does not match the training dimension
  if(is.null(dim(x))){
    x <- matrix(x, nrow = 1)
    if(ncol(x) != ncol(object$X0)) stop("x is not a matrix")
  }
  
  if(!is.null(xprime) && is.null(dim(xprime))){
    xprime <- matrix(xprime, nrow = 1)
    if(ncol(xprime) != ncol(object$X0)) stop("xprime is not a matrix")
  }
  
  ## Rebuild the inverse noise-process covariance if it was stripped
  ## (e.g., model exported with return.matrices = FALSE)
  if(is.null(object$Kgi)){
    if(is.null(object$pX)){
      Cg <- cov_gen(X1 = object$X0, theta = object$theta_g, type = object$covtype)
    }else{
      ## pseudo-inputs variant: noise GP anchored at pX instead of X0
      Cg <- cov_gen(X1 = object$pX, theta = object$theta_g, type = object$covtype)
    }
    
    object$Kgi <- chol2inv(chol(add_diag(Cg, object$eps + object$g/object$mult)))
  }
  
  ## Cross-covariances between prediction sites and the noise GP anchors
  if(is.null(object$pX)){
    kg <- cov_gen(X1 = x, X2 = object$X0, theta = object$theta_g, type = object$covtype)
  }else{
    kg <- cov_gen(X1 = x, X2 = object$pX, theta = object$theta_g, type = object$covtype)
  }
  
  ## Rebuild the inverse mean-process covariance if missing
  if(is.null(object$Ki))
    object$Ki <- chol2inv(chol(add_diag(cov_gen(X1 = object$X0, theta = object$theta, type = object$covtype), object$Lambda/object$mult + object$eps)))
  
  ## Noise GP prediction: kriging correction M around the constant trend nmean
  ## (with optional SiNK rescaling)
  if(object$SiNK){
    M <-  1/rho_AN(xx = x, X0 = object$pX, theta_g = object$theta_g, g = object$g,
                   type = object$covtype, SiNK_eps = object$SiNK_eps, eps = object$eps, mult = object$mult) * kg %*% (object$Kgi %*% (object$Delta - object$nmean))
  }else{
    M <- kg %*% (object$Kgi %*% (object$Delta - object$nmean))
  }
  
  ## Predicted noise variances; exponentiate when the noise GP was fit on
  ## log-variances, otherwise threshold negative values at zero
  if(object$logN){
    nugs <- object$nu_hat * exp(drop(object$nmean + M))
  }else{
    nugs <- object$nu_hat * pmax(0, drop(object$nmean + M))
  }
  
  if(nugs.only){
    return(list(nugs = nugs))
  }
  
  ## Optional kriging variance of the (latent) noise process itself
  if(noise.var){
    if(is.null(object$nu_hat_var))
      object$nu_hat_var <- max(object$eps, drop(crossprod(object$Delta - object$nmean, object$Kgi) %*% (object$Delta - object$nmean))/length(object$Delta)) ## To avoid 0 variance
    sd2var <- object$nu_hat * object$nu_hat_var* drop(1 - fast_diag(kg, tcrossprod(object$Kgi, kg)) + (1 - tcrossprod(rowSums(object$Kgi), kg))^2/sum(object$Kgi))
  }else{
    sd2var <- NULL
  }
  
  kx <- cov_gen(X1 = x, X2 = object$X0, theta = object$theta, type = object$covtype)
  
  ## Mean-process kriging variance; the OK branch adds the extra term
  ## accounting for the estimated constant trend
  if(object$trendtype == 'SK'){
    sd2 <- object$nu_hat * drop(1 - fast_diag(kx, tcrossprod(object$Ki, kx)))
  }else{
    sd2 <- object$nu_hat * drop(1 - fast_diag(kx, tcrossprod(object$Ki, kx)) + (1 - tcrossprod(rowSums(object$Ki), kx))^2/sum(object$Ki))
  }
  
  ## In case of numerical errors, some sd2 values may become negative
  if(any(sd2 < 0)){
    # object$Ki <- ginv(add_diag(cov_gen(X1 = object$X0, theta = object$theta, type = object$covtype), object$Lambda/object$mult + object$eps))
    # 
    # if(object$trendtype == 'SK'){
    #   sd2 <- object$nu_hat * drop(1 - fast_diag(kx, tcrossprod(object$Ki, kx)))
    # }else{
    #   sd2 <- object$nu_hat * drop(1 - fast_diag(kx, tcrossprod(object$Ki, kx)) + (1 - tcrossprod(rowSums(object$Ki), kx))^2/sum(object$Ki))
    # }
    sd2 <- pmax(0, sd2)
    warning("Numerical errors caused some negative predictive variances to be thresholded to zero. Consider using ginv via rebuild.hetGP")
  }
  
  ## Optional predictive covariance between x and xprime; the two branches
  ## differ only in matrix-product grouping, chosen by which side is smaller
  if(!is.null(xprime)){
    kxprime <- cov_gen(X1 = object$X0, X2 = xprime, theta = object$theta, type = object$covtype)
    if(object$trendtype == 'SK'){
      if(nrow(x) < nrow(xprime)){
        cov <- object$nu_hat * (cov_gen(X1 = x, X2 = xprime, theta = object$theta, type = object$covtype) - kx %*% object$Ki %*% kxprime) 
      }else{
        cov <- object$nu_hat * (cov_gen(X1 = x, X2 = xprime, theta = object$theta, type = object$covtype) - kx %*% (object$Ki %*% kxprime))
      }
    }else{
      if(nrow(x) < nrow(xprime)){
        cov <- object$nu_hat * (cov_gen(X1 = x, X2 = xprime, theta = object$theta, type = object$covtype) - kx %*% object$Ki %*% kxprime + crossprod(1 - tcrossprod(rowSums(object$Ki), kx), 1 - rowSums(object$Ki) %*% kxprime)/sum(object$Ki))
      }else{
        cov <- object$nu_hat * (cov_gen(X1 = x, X2 = xprime, theta = object$theta, type = object$covtype) - kx %*% (object$Ki %*%  kxprime) + crossprod(1 - tcrossprod(rowSums(object$Ki), kx), 1 - rowSums(object$Ki) %*% kxprime)/sum(object$Ki))
      }
    }
  }else{
    cov = NULL
  }
  
  return(list(mean = drop(object$beta0 + kx %*% (object$Ki %*% (object$Z0 - object$beta0))),
              sd2 = sd2,
              nugs = nugs,
              sd2var = sd2var,
              cov = cov))
}


#' @method summary hetGP
#' @export
summary.hetGP <- function(object,...){
  ## Return the model itself, re-classed so that print.summary.hetGP
  ## is dispatched when the result is displayed
  structure(object, class = "summary.hetGP")
}

#' @export
print.summary.hetGP <- function(x, ...){
  ## Human-readable summary of a fitted hetGP model:
  ## data sizes, kernel hyperparameters, trend, noise process and MLE info.
  cat("N = ", length(x$Z), " n = ", length(x$Z0), " d = ", ncol(x$X0), "\n")
  cat(x$covtype, " covariance lengthscale values of the main process: ", x$theta, "\n")
  cat("Variance/scale hyperparameter: ", x$nu_hat, "\n")
  
  cat("Summary of Lambda values: \n")
  print(summary(x$Lambda))
  
  if(x$trendtype == "SK"){
    cat("Given constant trend value: ", x$beta0, "\n")
  }else{
    cat("Estimated constant trend value: ", x$beta0, "\n")
  }
  
  ## The noise GP is fitted on log-variances when logN is TRUE; label accordingly
  ## (fix: the non-log branch previously mislabeled the lengthscales as "log-noise")
  if(x$logN){
    cat(x$covtype, " covariance lengthscale values of the log-noise process: ", x$theta_g, "\n")
    cat("Nugget of the log-noise process: ", x$g, "\n")
    cat("Estimated constant trend value of the log-noise process: ", x$nmean, "\n")
  }else{
    cat(x$covtype, " covariance lengthscale values of the noise process: ", x$theta_g, "\n")
    cat("Nugget of the noise process: ", x$g, "\n")
    cat("Estimated constant trend value of the noise process: ", x$nmean, "\n")
  }
  
  cat("MLE optimization: \n", "Log-likelihood = ", x$ll, "; Nb of evaluations (obj, gradient) by L-BFGS-B: ", x$nit_opt, "; message: ", x$msg, "\n")
  
}

#' @method print hetGP
#' @export
print.hetGP <- function(x, ...){
  ## Printing a hetGP model shows its summary
  s <- summary(x)
  print(s)
}

#' @method plot hetGP
#' @export
plot.hetGP <- function(x, ...){
  ## Diagnostic plot: leave-one-out predictions (via LOO_preds) against the
  ## observed responses; a good fit hugs the 45-degree line.
  LOOpreds <- LOO_preds(x)
  ## One point per observation: LOO means at unique designs are repeated
  ## according to the replicate counts in x$mult
  plot(x$Z, LOOpreds$mean[rep(1:nrow(x$X0), times = x$mult)], xlab = "Observed values", ylab = "Predicted values",
       main = "Leave-one-out predictions")
  ## 90% LOO prediction intervals drawn with zero-length arrow heads
  ## NOTE(review): the interval endpoints are placed on the x-axis around the
  ## LOO mean (y held at the mean) — confirm this orientation is intended
  arrows(x0 = LOOpreds$mean + sqrt(LOOpreds$sd2) * qnorm(0.05),
         x1 = LOOpreds$mean + sqrt(LOOpreds$sd2) * qnorm(0.95),
         y0 = LOOpreds$mean, length = 0, col = "blue")
  ## Highlight replicated designs: averaged response vs LOO mean, in red
  points(x$Z0[which(x$mult > 1)], LOOpreds$mean[which(x$mult > 1)], pch = 20, col = 2)
  abline(a = 0, b = 1, lty = 3) # perfect-fit reference line
  legend("topleft", pch = c(1, 20, NA), lty = c(NA, NA, 1), col = c(1, 2, 4),
         legend = c("observations", "averages (if > 1 observation)", "LOO prediction interval"))
}

## ' Rebuild inverse covariance matrices of \code{hetGP} (e.g., if exported without inverse matrices \code{Kgi} and/or \code{Ki})
## ' @param object \code{hetGP} model without slots \code{Ki} and/or \code{Kgi} (inverse covariance matrices)
## ' @param robust if \code{TRUE} \code{\link[MASS]{ginv}} is used for matrix inversion, otherwise it is done via Cholesky.
#' @method rebuild hetGP
#' @rdname ExpImp
#' @export
rebuild.hetGP <- function(object, robust = FALSE) {
  # Noise-process covariance is built on the pseudo-inputs pX when present,
  # otherwise on the unique designs X0.
  Xg <- if (is.null(object$pX)) object$X0 else object$pX
  Cg <- cov_gen(X1 = Xg, theta = object$theta_g, type = object$covtype)
  C <- cov_gen(X1 = object$X0, theta = object$theta, type = object$covtype)

  # Pick the inversion strategy once: robust pseudo-inverse vs fast Cholesky
  invert <- if (robust) ginv else function(M) chol2inv(chol(M))

  # Nugget terms: g (resp. Lambda) scaled by replication, plus jitter eps
  object$Kgi <- invert(add_diag(Cg, object$eps + object$g / object$mult))
  object$Ki <- invert(add_diag(C, object$Lambda / object$mult + object$eps))
  object
}


# ###############################################################################
# ## Appendix: low level functions
# ###############################################################################

#' Prepare data for use with \code{\link[hetGP]{mleHetGP}}, in particular to find replicated observations
#' @title Data preprocessing 
#' @param X matrix of design locations, one point per row
#' @param Z vector of observations at \code{X}
#' @param return.Zlist to return \code{Zlist}, see below
#' @param rescale if \code{TRUE}, the inputs are rescaled to the unit hypercube
#' @param normalize if \code{TRUE}, the outputs are centered and normalized
#' @param inputBounds optional matrix of known boundaries in original input space, of size 2 times \code{ncol(X)}. 
#' If not provided, and \code{rescale == TRUE}, it is estimated from the data.   
#' @return A list with the following elements that can be passed to the main fitting functions, e.g., \code{\link{mleHetGP}} and \code{\link{mleHomGP}}
#' \itemize{
#' \item \code{X0} matrix with unique designs locations, one point per row,
#' \item \code{Z0} vector of averaged observations at \code{X0},
#' \item \code{mult} number of replicates at \code{X0},
#' \item \code{Z} vector with all observations, sorted according to \code{X0},
#' \item \code{Zlist} optional list, each element corresponds to observations at a design in \code{X0},
#' \item \code{inputBounds} optional matrix, to rescale back to the original input space,
#' \item \code{outputStats} optional vector, with mean and variance of the original outputs.
#' }
#' @details Replicates are searched based on character representation, using \code{\link[base]{unique}}.
#' @examples 
#' ##------------------------------------------------------------
#' ## Find replicates on the motorcycle data
#' ##------------------------------------------------------------
#' ## motorcycle data
#' library(MASS)
#' X <- matrix(mcycle$times, ncol = 1)
#' Z <- mcycle$accel
#' 
#' data_m <- find_reps(X, Z)
#' 
#' # Initial data
#' plot(X, Z, ylim = c(-160, 90), ylab = 'acceleration', xlab = "time")
#' # Display mean values
#' points(data_m$X0, data_m$Z0, pch = 20)
#' @export
find_reps <- function(X, Z, return.Zlist = TRUE, rescale = FALSE, normalize = FALSE, inputBounds = NULL) {
  # Coerce a bare input vector into a one-column design matrix
  if (is.null(dim(X)))
    X <- matrix(X, ncol = 1)

  # Degenerate case: a single design point, nothing to group.
  # NOTE(review): rescale/normalize are not applied on this early exit,
  # and inputBounds/outputStats are not returned (pre-existing behavior).
  if (nrow(X) == 1) {
    if (return.Zlist)
      return(list(X0 = X, Z0 = Z, mult = 1, Z = Z, Zlist = list(Z)))
    return(list(X0 = X, Z0 = Z, mult = 1, Z = Z))
  }

  # Map inputs to the unit hypercube, estimating bounds if not supplied
  if (rescale) {
    if (is.null(inputBounds))
      inputBounds <- apply(X, 2, range)
    lower <- matrix(inputBounds[1, ], nrow = nrow(X), ncol = ncol(X), byrow = TRUE)
    X <- (X - lower) %*% diag(1 / (inputBounds[2, ] - inputBounds[1, ]), ncol(X))
  }

  # Center and scale the outputs, remembering the stats to undo it later
  outputStats <- NULL
  if (normalize) {
    outputStats <- c(mean(Z), var(Z))
    Z <- (Z - outputStats[1]) / sqrt(outputStats[2])
  }

  X0 <- unique(X)

  # No duplicated rows: each design has exactly one observation
  if (nrow(X0) == nrow(X)) {
    res <- list(X0 = X, Z0 = Z, mult = rep(1, length(Z)), Z = Z)
    if (return.Zlist)
      res$Zlist <- as.list(Z)
    # c() keeps NULL-valued entries, unlike $<- assignment
    return(c(res, list(inputBounds = inputBounds, outputStats = outputStats)))
  }

  # Group observations by the unique design they belong to; split() orders
  # groups by ascending design index, matching X0's row order
  corresp <- find_corres(X0, X)
  Zlist <- split(Z, corresp)
  mult <- as.numeric(lengths(Zlist))

  res <- list(X0 = X0, Z0 = unlist(lapply(Zlist, mean)), mult = mult, Z = unlist(Zlist))
  if (return.Zlist)
    res$Zlist <- Zlist
  c(res, list(inputBounds = inputBounds, outputStats = outputStats))
}

#' @title Generic Log-likelihood function
#' This function can be used to compute loglikelihood for homGP/hetGP models
#' @details For hetGP, this is not the joint log-likelihood, only the likelihood of the mean process.
#' @param X0 unique designs
#' @param Z0 averaged observations
#' @param Z replicated observations (sorted with respect to X0)
#' @param mult number of replicates at each Xi
#' @param Delta vector of nuggets corresponding to each X0i or pXi, that are smoothed to give Lambda;
#' if \code{NULL}, the homoskedastic likelihood is computed instead
#' @param logN should exponentiated variance be used
#' @param theta scale parameter for the mean process, either one value (isotropic) or a vector (anistropic)
#' @param k_theta_g constant used for linking nuggets lengthscale to mean process lengthscale, i.e., theta_g[k] = k_theta_g * theta[k], alternatively theta_g can be used
#' @param theta_g either one value (isotropic) or a vector (anistropic), alternative to using k_theta_g
#' @param g nugget of the nugget process
#' @param beta0 mean, if not provided, the MLE estimator is used
#' @param eps minimal value of elements of Lambda
#' @param covtype covariance kernel type
#' @keywords internal
#' @export
logLikH <- function(X0, Z0, Z, mult, theta, g, Delta = NULL, k_theta_g = NULL, theta_g = NULL, logN = FALSE,
                    beta0 = NULL, eps = sqrt(.Machine$double.eps), covtype = "Gaussian") {
  # Heteroskedastic case: Delta provided
  if (!is.null(Delta)) {
    # k_theta_g takes precedence over theta_g when both are supplied
    if (!is.null(k_theta_g)) theta_g <- NULL
    # NOTE(review): beta0, eps and covtype are not forwarded to logLikHet here,
    # so logLikHet's own defaults apply in this branch — confirm intended.
    return(logLikHet(X0 = X0, Z0 = Z0, Z = Z, mult = mult, theta = theta, Delta = Delta, g = g,
                     k_theta_g = k_theta_g, theta_g = theta_g, logN = logN,
                     penalty = FALSE, hom_ll = NULL, trace = 0))
  }
  # Homoskedastic case
  logLikHom(X0 = X0, Z0 = Z0, Z = Z, mult = mult, theta = theta, g = g,
            beta0 = beta0, covtype = covtype, eps = eps)
}

#' @title Score and RMSE function
#' To asses the performance of the prediction, this function computes the root mean squared error and proper score function (also known as negative log-probability density).
#' @references 
#' T. Gneiting, and A. Raftery (2007). Strictly Proper Scoring Rules, Prediction, and Estimation, Journal of the American Statistical Association, 102(477), 359-378.
#' @param model \code{homGP} or \code{hetGP} model, including inverse matrices
#' @param Xtest matrix of new design locations
#' @param Ztest corresponding vector of observations, or alternatively, 
#' a matrix of size [nrow(Xtest) x number of replicates], a list of size nrow(Xtest) with a least one value per element
#' @param return.rmse if \code{TRUE}, return the root mean squared error
#' @export
scores <- function(model, Xtest, Ztest, return.rmse = FALSE) {
  preds <- predict(model, Xtest)
  # Total predictive variance: latent GP variance plus noise
  ptot <- preds$sd2 + preds$nugs

  # Ragged replicates: expand each design's mean/variance to the number of
  # observations at that design
  if (is.list(Ztest)) {
    s2 <- m <- Ztest
    for (i in seq_along(Ztest)) {
      m[[i]] <- rep(preds$mean[i], length(m[[i]]))
      s2[[i]] <- rep(ptot[i], length(s2[[i]]))
    }
    m <- unlist(t(m))
    s2 <- unlist(t(s2))
    se <- (unlist(t(Ztest)) - m)^2
    sc <- -se / s2 - log(s2)
  }

  # One observation per design
  if (is.vector(Ztest, mode = "numeric")) {
    se <- (Ztest - preds$mean)^2
    sc <- -se / ptot - log(ptot)
  }

  # Balanced replicates: one row of observations per design
  if (is.matrix(Ztest)) {
    nrep <- ncol(Ztest)
    m <- as.numeric(t(matrix(rep(preds$mean, nrep), ncol = nrep)))
    s2 <- as.numeric(t(matrix(rep(ptot, nrep), ncol = nrep)))
    se <- (unlist(t(Ztest)) - m)^2
    sc <- -se / s2 - log(s2)
  }

  if (return.rmse) return(list(sc = mean(sc), rmse = sqrt(mean(se))))
  mean(sc)
}

#' Defines lower bound for lengthscale parameters based on a low quantile of non-zero distances between points in the design.
#' @param X design matrix
#' @param min_cor minimal correlation between two design points at the defined quantile distance, default to 0.01
#' @param max_cor maximal correlation between two design points at the defined (1-p) quantile distance, default to 0.5
#' @param p quantile on distances, default to 0.05
#' @param covtype covariance function used
#' @return list with \code{lower} and \code{upper} lengthscale bounds, on the original input scale
#' @importFrom stats quantile uniroot
#' @noRd
auto_bounds <- function(X, min_cor = 0.01, max_cor = 0.5, covtype = "Gaussian", p = 0.05){
  # Work on inputs rescaled to the unit hypercube so distances are comparable
  Xsc <- find_reps(X, rep(1, nrow(X)), rescale = TRUE) # rescaled distances
  
  dists <- distance_cpp(Xsc$X0) # pairwise (squared Euclidean) distances between unique designs
  # Representative small and large distances at the p and (1-p) quantiles
  repr_low_dist <- quantile(x = dists[lower.tri(dists)], probs = p, names = FALSE)
  repr_lar_dist <- quantile(x = dists[lower.tri(dists)], probs = 1-p, names = FALSE)
  
  if(covtype == "Gaussian"){
    # Closed form: exp(-d/theta) = cor  =>  theta = -d/log(cor)
    theta_min <- - repr_low_dist/log(min_cor)
    theta_max <- - repr_lar_dist/log(max_cor)
    # Squared factor maps unit-cube lengthscales back to the original scale
    return(list(lower = theta_min * (Xsc$inputBounds[2,] - Xsc$inputBounds[1,])^2,
                upper = theta_max * (Xsc$inputBounds[2,] - Xsc$inputBounds[1,])^2))
  }else{
    # No closed form: solve cor(theta) = value numerically at a representative distance
    tmpfun <- function(theta, repr_dist, covtype, value){
      cov_gen(matrix(sqrt(repr_dist/ncol(X)), ncol = ncol(X)), matrix(0, ncol = ncol(X)), type = covtype, theta = theta) - value
    }
    # silent = TRUE: failures are reported via the controlled warning below,
    # not by printing the raw try() error (consistent with theta_max)
    theta_min <- try(uniroot(tmpfun, interval = c(sqrt(.Machine$double.eps), 100), covtype = covtype, value = min_cor, 
                         repr_dist = repr_low_dist, tol = sqrt(.Machine$double.eps))$root, silent = TRUE)
    if(is(theta_min, "try-error")){
      warning("The automatic selection of lengthscales bounds was not successful. Perhaps provide lower and upper values.")
      theta_min <- 1e-2 # conservative fallback lower bound
    }
    theta_max <- try(uniroot(tmpfun, interval = c(sqrt(.Machine$double.eps), 100), covtype = covtype, value = max_cor,
                         repr_dist = repr_lar_dist, tol = sqrt(.Machine$double.eps))$root, silent = TRUE)
    if(is(theta_max, "try-error")){
      theta_max <- 5 # conservative fallback upper bound
    }
    # Linear (not squared) rescaling for non-Gaussian kernels
    return(list(lower = theta_min * (Xsc$inputBounds[2,] - Xsc$inputBounds[1,]),
                upper = max(1, theta_max) * (Xsc$inputBounds[2,] - Xsc$inputBounds[1,])))
  }
} 

# Rho function for SiNK prediction, anistropic case
## @param covtype covariance kernel type, either 'Gaussian' or 'Matern5_2'
rho_AN <- function(xx, X0, theta_g, g, sigma = 1, type = "Gaussian", SiNK_eps = 1e-4, eps = sqrt(.Machine$double.eps), mult) {
  # Coerce a single input point into a one-row matrix
  if (is.null(nrow(xx)))
    xx <- matrix(xx, nrow = 1)

  # Noise-process covariance at the designs, with jittered nugget g/mult
  Kg <- sigma * cov_gen(X1 = X0, theta = theta_g, type = type) + diag(eps + g / mult, nrow(X0))
  # Cross-covariances between the new points and the designs
  kg <- sigma * cov_gen(X1 = xx, X2 = X0, theta = theta_g, type = type)

  # rho(x) = sqrt(k(x)' Kg^-1 k(x)) / sigma^2, floored at SiNK_eps for stability
  Kgi <- chol2inv(chol(Kg))
  pmax(SiNK_eps, sqrt(diag(kg %*% Kgi %*% t(kg))) / sigma^2)
}

# Try the hetGP package in your browser
#
# Any scripts or data that you put into this service are public.
#
# hetGP documentation built on Oct. 3, 2023, 1:07 a.m.