R/hetGP.R

Defines functions rho_AN auto_bounds scores logLikH find_reps rebuild.hetGP plot.hetGP print.hetGP print.summary.hetGP summary.hetGP predict.hetGP mleHetGP compareGP dlogLikHet logLikHet rebuild.homGP predict.homGP plot.homGP print.homGP print.summary.homGP summary.homGP mleHomGP dlogLikHom logLikHom

Documented in compareGP find_reps logLikH mleHetGP mleHomGP predict.hetGP predict.homGP rebuild.hetGP rebuild.homGP scores

################################################################################
### Contains:
### - log-likelihood computations using simplified expression in case of replicates
### - maximization of log-likelihood functions
### - prediction given hyperparameters
################################################################################

################################################################################
## Part I: Homoskedastic noise
################################################################################

## Model: noisy observations with unknown homoskedastic noise
## K = nu^2 * (C + g * I)
# X0 unique designs matrix
# Z0 averaged observations at X0
# Z observations vector (all observations)
# mult number of replicates at each unique design
# theta vector of lengthscale hyperparameters (or one for isotropy)
# g noise variance for the process
# beta0 trend
# @return loglikelihood value
## Log-likelihood for a homoskedastic GP, using the simplified expression
## under replication; the scale nu^2 is concentrated out (plugin estimator).
## Model: noisy observations with unknown homoskedastic noise
## K = nu^2 * (C + g * I)
# X0 unique designs matrix
# Z0 averaged observations at X0
# Z observations vector (all observations)
# mult number of replicates at each unique design
# theta vector of lengthscale hyperparameters (or one for isotropy)
# g noise variance for the process
# beta0 trend (GLS-estimated when NULL)
# env optional environment used to cache C and Ki for reuse in dlogLikHom
# @return loglikelihood value
logLikHom <- function(X0, Z0, Z, mult, theta, g, beta0 = NULL, covtype = "Gaussian", eps = sqrt(.Machine$double.eps), env = NULL){
  n <- nrow(X0)
  N <- length(Z)
  
  # Temporarily store Cholesky transform of K in Ki
  C <- cov_gen(X1 = X0, theta = theta, type = covtype)
  if(!is.null(env)) env$C <- C
  Ki <- chol(C + diag(eps + g / mult))
  ldetKi <- - 2 * sum(log(diag(Ki))) # log determinant from Cholesky
  Ki <- chol2inv(Ki)
  
  if(!is.null(env)) env$Ki <- Ki
  
  # GLS estimate of the constant trend when not provided
  if(is.null(beta0))
    beta0 <- drop(colSums(Ki) %*% Z0 / sum(Ki))
  
  psi_0 <- drop(crossprod(Z0 - beta0, Ki) %*% (Z0 - beta0))
  
  # Plugin estimator of the process variance, exploiting replication
  psi <- 1/N * ((crossprod(Z - beta0) - crossprod((Z0 - beta0) * mult, Z0 - beta0))/g + psi_0)
  
  loglik <- -N/2 * log(2*pi) - N/2 * log(psi) + 1/2 * ldetKi - (N - n)/2 * log(g) - 1/2 * sum(log(mult)) - N/2
  # Explicit return: the original ended on an assignment, which returns the
  # value invisibly.
  return(loglik)
}

# derivative of log-likelihood for logLikHom with respect to theta with all observations (Gaussian kernel)
## Model: noisy observations with unknown homoskedastic noise
## K = nu^2 * (C + g * I)
# X0  design matrix (no replicates)
# Z0 averaged observations
# Z observations vector (all observations)
# mult number of replicates per unique design point
# theta vector of lengthscale hyperparameters (or one for isotropy)
# g noise variance for the process
# beta0 trend
## @return gradient with respect to theta and g
# Gradient of logLikHom with respect to theta and/or g.
## Model: noisy observations with unknown homoskedastic noise
## K = nu^2 * (C + g * I)
# X0  design matrix (no replicates)
# Z0 averaged observations
# Z observations vector (all observations)
# mult number of replicates per unique design point
# theta vector of lengthscale hyperparameters (or one for isotropy)
# g noise variance for the process
# beta0 trend (GLS-estimated when NULL)
# components which partial derivatives to compute ("theta" and/or "g")
# env optional environment holding C and Ki cached by logLikHom
## @return gradient with respect to theta and g (concatenated)
dlogLikHom <- function(X0, Z0, Z, mult, theta, g, beta0 = NULL, covtype = "Gaussian",
                       eps = sqrt(.Machine$double.eps), components = c("theta", "g"), env = NULL){
  k <- length(Z)  # total number of observations N
  n <- nrow(X0)   # number of unique designs
  
  # Reuse cached covariance quantities from logLikHom when available
  if(!is.null(env)){
    C <- env$C
    Ki <- env$Ki
  }else{
    C <- cov_gen(X1 = X0, theta = theta, type = covtype)
    Ki <- chol2inv(chol(C + diag(eps + g / mult)))
  }
  
  if(is.null(beta0))
    beta0 <- drop(colSums(Ki) %*% Z0 / sum(Ki))
  
  # Center observations once
  Z0 <- Z0 - beta0
  Z <- Z - beta0
  
  KiZ0 <- Ki %*% Z0 ## to avoid recomputing
  
  psi <- drop(crossprod(Z0, KiZ0))
  
  # Invariant quantities, hoisted out of the theta loop (the original
  # recomputed them at every iteration and again for the g component)
  sq_res <- drop(crossprod(Z) - crossprod(Z0 * mult, Z0))
  denom <- sq_res / g + psi
  
  tmp1 <- tmp2 <- NULL
  
  # First component, derivative with respect to theta
  if("theta" %in% components){
    tmp1 <- rep(NA, length(theta))
    if(length(theta) == 1){
      # Isotropic case: a single lengthscale for all dimensions
      dC_dthetak <- partial_cov_gen(X1 = X0, theta = theta, type = covtype, arg = "theta_k") * C
      tmp1 <- k/2 * crossprod(KiZ0, dC_dthetak) %*% KiZ0 / denom - 1/2 * trace_sym(Ki, dC_dthetak)
    }else{
      for(i in seq_along(theta)){
        dC_dthetak <- partial_cov_gen(X1 = X0[, i, drop = FALSE], theta = theta[i], type = covtype, arg = "theta_k") * C
        tmp1[i] <- k/2 * crossprod(KiZ0, dC_dthetak) %*% KiZ0 / denom - 1/2 * trace_sym(Ki, dC_dthetak)
      }
    } 
  }
  
  # Second component, derivative with respect to g
  if("g" %in% components)
    tmp2 <- k/2 * (sq_res/g^2 + sum(KiZ0^2/mult)) / denom - (k - n)/ (2*g) - 1/2 * sum(diag(Ki)/mult)
  
  return(c(tmp1, tmp2))
}

#' Gaussian process regression under homoskedastic noise based on maximum likelihood estimation of the 
#' hyperparameters. This function is enhanced to deal with replicated observations.
#' @title Gaussian process modeling with homoskedastic noise
#' @param X matrix of all designs, one per row, or list with elements:
#' \itemize{
#'   \item \code{X0} matrix of unique design locations, one point per row
#'   \item \code{Z0} vector of averaged observations, of length \code{nrow(X0)}
#'   \item \code{mult} number of replicates at designs in \code{X0}, of length \code{nrow(X0)}
#' } 
#' @param Z vector of all observations. If using a list with \code{X}, \code{Z} has to be ordered with respect to \code{X0}, and of length \code{sum(mult)}
#' @param lower,upper optional bounds for the \code{theta} parameter (see \code{\link[hetGP]{cov_gen}} for the exact parameterization).
#' In the multivariate case, it is possible to give vectors for bounds (resp. scalars) for anisotropy (resp. isotropy) 
#' @param known optional list of known parameters, e.g., \code{beta0}, \code{theta} or \code{g}
#' @param covtype covariance kernel type, either 'Gaussian', 'Matern5_2' or 'Matern3_2', see \code{\link[hetGP]{cov_gen}}
#' @param noiseControl list with element:
#' \itemize{
#' \item \code{g_bounds}, vector providing minimal and maximal noise to signal ratio
#' } 
#' @param init optional list specifying starting values for MLE optimization, with elements:
#' \itemize{
#'  \item \code{theta_init} initial value of the theta parameters to be optimized over (default to 10\% of the range determined with \code{lower} and \code{upper})
#'  \item \code{g_init} initial value of the nugget parameter to be optimized over (based on the variance at replicates if there are any, else \code{0.1})
#' }
#' @param maxit maximum number of iteration for L-BFGS-B of \code{\link[stats]{optim}}
#' @param eps jitter used in the inversion of the covariance matrix for numerical stability
#' @param settings list with argument \code{return.Ki}, to include the inverse covariance matrix in the object for further use (e.g., prediction).
#' Arguments \code{factr} (default to 1e9) and \code{pgtol} are available to be passed to \code{control} for L-BFGS-B in \code{\link[stats]{optim}} (for the joint likelihood only). 
#' @return a list which is given the S3 class "\code{homGP}", with elements:
#' \itemize{
#' \item \code{theta}: maximum likelihood estimate of the lengthscale parameter(s),
#' \item \code{g}: maximum likelihood estimate of the nugget variance,
#' \item \code{trendtype}: either "\code{SK}" if \code{beta0} is given, else "\code{OK}" 
#' \item \code{beta0}: estimated trend unless given in input,
#' \item \code{nu_hat}: plugin estimator of the variance,
#' \item \code{ll}: log-likelihood value,
#' \item \code{X0}, \code{Z0}, \code{Z}, \code{mult}, \code{eps}, \code{covtype}: values given in input,
#' \item \code{call}: user call of the function
#' \item \code{used_args}: list with arguments provided in the call
#' \item \code{nit_opt}, \code{msg}: \code{counts} and \code{msg} returned by \code{\link[stats]{optim}}
#' \item \code{Ki}: inverse covariance matrix (not scaled by \code{nu_hat}) (if \code{return.Ki} is \code{TRUE} in \code{settings})
#' \item \code{time}: time to train the model, in seconds.
#' 
#'}
#' @details
#' The global covariance matrix of the model is parameterized as \code{nu_hat * (C + g * diag(1/mult)) = nu_hat * K},
#' with \code{C} the correlation matrix between unique designs, depending on the family of kernel used (see \code{\link[hetGP]{cov_gen}} for available choices) and values of lengthscale parameters.
#' \code{nu_hat} is the plugin estimator of the variance of the process.
#' 
#' It is generally recommended to use \code{\link[hetGP]{find_reps}} to pre-process the data, to rescale the inputs to the unit cube and to normalize the outputs.
#' 
#' @seealso \code{\link[hetGP]{predict.homGP}} for predictions, \code{\link[hetGP]{update.homGP}} for updating an existing model. 
#' \code{summary} and \code{plot} functions are available as well. 
#' \code{\link[hetGP]{mleHomTP}} provides a Student-t equivalent.
#' @references 
#' M. Binois, Robert B. Gramacy, M. Ludkovski (2018), Practical heteroskedastic Gaussian process modeling for large simulation experiments,
#' Journal of Computational and Graphical Statistics, 27(4), 808--821.\cr 
#' Preprint available on arXiv:1611.05902. \cr \cr
#' @export
## ' @importFrom numDeriv hessian
#' @examples
#' ##------------------------------------------------------------
#' ## Example 1: Homoskedastic GP modeling on the motorcycle data
#' ##------------------------------------------------------------
#' set.seed(32)
#' 
#' ## motorcycle data
#' library(MASS)
#' X <- matrix(mcycle$times, ncol = 1)
#' Z <- mcycle$accel
#' plot(X, Z, ylim = c(-160, 90), ylab = 'acceleration', xlab = "time")
#' 
#'  
#' model <- mleHomGP(X = X, Z = Z, lower = 0.01, upper = 100)
#'   
#' ## Display averaged observations
#' points(model$X0, model$Z0, pch = 20) 
#' xgrid <- matrix(seq(0, 60, length.out = 301), ncol = 1) 
#' predictions <- predict(x = xgrid, object =  model)
#' 
#' ## Display mean prediction
#' lines(xgrid, predictions$mean, col = 'red', lwd = 2)
#' ## Display 95% confidence intervals
#' lines(xgrid, qnorm(0.05, predictions$mean, sqrt(predictions$sd2)), col = 2, lty = 2)
#' lines(xgrid, qnorm(0.95, predictions$mean, sqrt(predictions$sd2)), col = 2, lty = 2)
#' ## Display 95% prediction intervals
#' lines(xgrid, qnorm(0.05, predictions$mean, sqrt(predictions$sd2 + predictions$nugs)), 
#'   col = 3, lty = 2)
#' lines(xgrid, qnorm(0.95, predictions$mean, sqrt(predictions$sd2 + predictions$nugs)), 
#'   col = 3, lty = 2)

mleHomGP <- function(X, Z, lower = NULL, upper = NULL, known = NULL,
                     noiseControl = list(g_bounds = c(sqrt(.Machine$double.eps), 1e2)),
                     init = NULL,
                     covtype = c("Gaussian", "Matern5_2", "Matern3_2"),
                     maxit = 100, eps = sqrt(.Machine$double.eps), settings = list(return.Ki = TRUE, factr = 1e7)){
  
  # Accept either pre-processed data (list with X0/Z0/mult, as from find_reps)
  # or a raw design matrix whose replicates are detected here
  if(typeof(X)=="list"){
    X0 <- X$X0
    Z0 <- X$Z0
    mult <- X$mult
    if(sum(mult) != length(Z)) stop("Length(Z) should be equal to sum(mult)")
    if(is.null(dim(X0))) X0 <- matrix(X0, ncol = 1)
    if(length(Z0) != nrow(X0)) stop("Dimension mismatch between Z0 and X0")
  }else{
    if(is.null(dim(X))) X <- matrix(X, ncol = 1)
    if(nrow(X) != length(Z)) stop("Dimension mismatch between Z and X")
    elem <- find_reps(X, Z, return.Zlist = F)
    X0 <- elem$X0
    Z0 <- elem$Z0
    Z <- elem$Z
    mult <- elem$mult
  }
  
  covtype <- match.arg(covtype)
  
  # Derive default lengthscale bounds from the design when not user-provided;
  # in that case theta starts at the geometric mean of the bounds
  if(is.null(lower) || is.null(upper)){
    auto_thetas <- auto_bounds(X = X0, covtype = covtype)
    if(is.null(lower)) lower <- auto_thetas$lower
    if(is.null(upper)) upper <- auto_thetas$upper
    if(is.null(known[["theta"]]) && is.null(init$theta)) init$theta <- sqrt(upper * lower)
  }
  if(length(lower) != length(upper)) stop("upper and lower should have the same size")
  
  ## Save time to train model
  tic <- proc.time()[3]
  
  if(is.null(settings$return.Ki)) settings$return.Ki <- TRUE
  if(is.null(noiseControl$g_bounds)) noiseControl$g_bounds <- c(sqrt(.Machine$double.eps), 1e2)
  
  g_min <- noiseControl$g_bounds[1]
  g_max <- noiseControl$g_bounds[2]
  
  beta0 <- known$beta0
  
  N <- length(Z)
  n <- nrow(X0)
  
  if(is.null(n))
    stop("X0 should be a matrix. \n")
  
  # Default starting values: theta at 10% of the range (unless already set
  # above), g from the empirical variance at replicated designs when available
  if(is.null(known[["theta"]]) && is.null(init$theta)) init$theta <- 0.9 * lower + 0.1 * upper # useful for mleHetGP
  if(is.null(known$g) && is.null(init$g)){
    if(any(mult > 2)) init$g <- mean((fast_tUY2(mult, (Z - rep(Z0, times = mult))^2)/mult)[which(mult > 2)])/var(Z0) else init$g <- 0.1
  }
  
  # Simple kriging (SK) if the trend is known, ordinary kriging (OK) otherwise
  trendtype <- 'OK'
  if(!is.null(beta0))
    trendtype <- 'SK'
  
  ## General definition of fn and gr
  # Objective for optim: unpack par into (theta, g) depending on which are
  # fixed via 'known', then evaluate the log-likelihood
  fn <- function(par, X0, Z0, Z, mult, beta0, theta, g, env){
    idx <- 1 # to store the first non used element of par
    
    if(is.null(theta)){
      theta <- par[1:length(init$theta)]
      idx <- idx + length(init$theta)
    }
    
    if(is.null(g)){
      g <- par[idx]
    }
    
    loglik <- logLikHom(X0 = X0, Z0 = Z0, Z = Z, mult = mult, theta = theta, g = g, beta0 = beta0, covtype = covtype, eps = eps, env = env)
    
    # Track the best parameters seen so far, so a later optim failure can
    # still return a usable fit
    if(!is.null(env) && !is.na(loglik)){
      if(is.null(env$max_loglik) || loglik > env$max_loglik){
        env$max_loglik <- loglik
        env$arg_max <- par
      }
    } 
    
    return(loglik)
  }
  
  # Gradient of fn, restricted to the components actually being optimized
  gr <- function(par, X0, Z0, Z, mult, beta0, theta, g, env){
    idx <- 1
    components <- NULL
    
    if(is.null(theta)){
      theta <- par[1:length(init$theta)]
      idx <- idx + length(init$theta)
      components <- "theta"
    }
    
    if(is.null(g)){
      g <- par[idx]
      components <- c(components, "g")
    }
    return(dlogLikHom(X0 = X0, Z0 = Z0, Z = Z, mult = mult, theta = theta, g = g, beta0 = beta0, covtype = covtype, eps = eps,
                      components = components, env = env))
  }
  
  ## Both known
  # envtmp serves both as cache (C, Ki shared between fn and gr) and as
  # best-so-far storage written by fn, read in the try-error fallback below
  envtmp <- environment()
  if(!is.null(known$g) && !is.null(known[["theta"]])){
    theta_out <- known[["theta"]]
    g_out <- known$g
    out <- list(value = logLikHom(X0 = X0, Z0 = Z0, Z = Z, mult = mult, theta = theta_out, g = g_out, beta0 = beta0, covtype = covtype, eps = eps),
                message = "All hyperparameters given", counts = 0, time = proc.time()[3] - tic)
  }else{
    # Assemble starting point and box bounds for the free parameters only
    parinit <- lowerOpt <- upperOpt <- NULL
    if(is.null(known[["theta"]])){
      parinit <- init$theta
      lowerOpt <- c(lower)
      upperOpt <- c(upper)
    }
    if(is.null(known$g)){
      parinit <- c(parinit, init$g)
      lowerOpt <- c(lowerOpt, g_min)
      upperOpt <- c(upperOpt, g_max)
    }
    
    out <- try(optim(par = parinit, fn = fn, gr = gr, method = "L-BFGS-B", lower = lowerOpt, upper = upperOpt, theta = known[["theta"]], g = known$g,
                     X0 = X0, Z0 = Z0, Z = Z, mult = mult, beta0 = beta0,
                     control = list(fnscale = -1, maxit = maxit, factr = settings$factr, pgtol = settings$pgtol), env = envtmp))
    ## Catch errors when at least one likelihood evaluation worked
    if(is(out, "try-error"))
      out <- list(par = envtmp$arg_max, value = envtmp$max_loglik, counts = NA,
                  message = "Optimization stopped due to NAs, use best value so far")
    
    # Recover the optimized values; g is always the last element of par
    if(is.null(known$g)) g_out <- out$par[length(out$par)] else g_out <- known$g
    if(is.null(known[["theta"]])) theta_out <- out$par[1:length(init$theta)] else theta_out <- known[["theta"]]
    
  }
  
  # Final inverse covariance at the optimum, and plugin variance estimate nu
  Ki <- chol2inv(chol(add_diag(cov_gen(X1 = X0, theta = theta_out, type = covtype), eps + g_out/ mult)))
  
  if(is.null(beta0))
    beta0 <- drop(colSums(Ki) %*% Z0 / sum(Ki))
  
  psi_0 <- drop(crossprod(Z0 - beta0, Ki) %*% (Z0 - beta0))
  
  nu <- 1/N * ((crossprod(Z - beta0) - crossprod((Z0 - beta0) * mult, Z0 - beta0))/g_out + psi_0)
  
  # # Get hessian of our cost function/
  # if (is.null(known[["theta"]])) {
  #   # Jacobian is more precise numerically but doesn't seem to work for some reason
  #   #hess <- jacobian(func = gr, x = out$par, theta = known[["theta"]], g = known$g,
  #   #             X0 = X0, Z0 = Z0, Z = Z, mult = mult, beta0 = beta0, env = envtmp)
  #   fwrap <- function(par, ...) fn(c(par, out$par[length(out$par)]), ...)
  #   hess <- hessian(func = fwrap, x = out$par[1:(length(out$par)-1)], theta = known[["theta"]], g = known$g,
  #                   X0 = X0, Z0 = Z0, Z = Z, mult = mult, beta0 = beta0, env = NULL)
  # } else {
  #   hess <- NULL
  # }
  
  res <- list(theta = theta_out, g = g_out, nu_hat = as.numeric(nu), ll = out$value, nit_opt = out$counts,
              beta0 = beta0, trendtype = trendtype, covtype = covtype, msg = out$message, eps = eps,
              X0 = X0, Z0 = Z0, Z = Z, mult = mult, call = match.call(),
              used_args = list(lower = lower, upper = upper, known = known, noiseControl = noiseControl),
              time = proc.time()[3] - tic) # hess = hess)
  
  if(settings$return.Ki) res <- c(res, list(Ki = Ki))
  
  class(res) <- "homGP"
  return(res)
}

#' @method summary homGP
#' @export
summary.homGP <- function(object, ...){
  ## Return a tagged copy of the model so that the dedicated
  ## print.summary.homGP method is dispatched on it
  res <- object
  class(res) <- "summary.homGP"
  return(res)
}

#' @export
print.summary.homGP <- function(x, ...){
  ## Data dimensions: total observations, unique designs, input dimension
  cat("N = ", length(x$Z), " n = ", length(x$Z0), " d = ", ncol(x$X0), "\n")
  cat(x$covtype, " covariance lengthscale values: ", x$theta, "\n")
  cat("Homoskedastic nugget value: ", x$g, "\n")
  cat("Variance/scale hyperparameter: ", x$nu_hat, "\n")
  
  ## "SK" means beta0 was supplied by the user; otherwise it was estimated
  trend_label <- if(x$trendtype == "SK") "Given" else "Estimated"
  cat(paste0(trend_label, " constant trend value: "), x$beta0, "\n")
  
  cat("MLE optimization: \n", "Log-likelihood = ", x$ll, "; Nb of evaluations (obj, gradient) by L-BFGS-B: ", x$nit_opt, "; message: ", x$msg, "\n")
}

#' @method print homGP
#' @export
print.homGP <- function(x, ...){
  ## Delegate all formatting to the summary print method
  print(summary(x))
}

#' @method plot homGP
#' @export
#' @importFrom graphics abline legend plot points arrows
#' @importFrom stats qnorm
plot.homGP <- function(x, ...){
  ## Leave-one-out predictions at the unique designs
  preds <- LOO_preds(x)
  ## Expand per-design LOO means to all replicates, so each observation
  ## in Z is plotted against the prediction at its design
  idx_all <- rep(seq_len(nrow(x$X0)), times = x$mult)
  plot(x$Z, preds$mean[idx_all], xlab = "Observed values", ylab = "Predicted values",
       main = "Leave-one-out predictions")
  ## Horizontal 90% LOO prediction intervals
  off_lo <- sqrt(preds$sd2) * qnorm(0.05)
  off_hi <- sqrt(preds$sd2) * qnorm(0.95)
  arrows(x0 = preds$mean + off_lo, x1 = preds$mean + off_hi,
         y0 = preds$mean, length = 0, col = "blue")
  ## Highlight averaged observations at replicated designs
  reps <- which(x$mult > 1)
  points(x$Z0[reps], preds$mean[reps], pch = 20, col = 2)
  abline(a = 0, b = 1, lty = 3)
  legend("topleft", pch = c(1, 20, NA), lty = c(NA, NA, 1), col = c(1, 2, 4),
         legend = c("observations", "averages (if > 1 observation)", "LOO prediction interval"))
}

#' Gaussian process predictions using a homoskedastic noise GP object (of class \code{homGP})
#' @param x matrix of designs locations to predict at (one point per row)
#' @param object an object of class \code{homGP}; e.g., as returned by \code{\link[hetGP]{mleHomGP}}
#' @param xprime optional second matrix of predictive locations to obtain the predictive covariance matrix between \code{x} and \code{xprime}
#' @param ... no other argument for this method
#' @return list with elements
#' \itemize{
#' \item \code{mean}: kriging mean;
#' \item \code{sd2}: kriging variance (filtered, e.g. without the nugget value)
#' \item \code{cov}: predictive covariance matrix between \code{x} and \code{xprime}
#' \item \code{nugs}: nugget value at each prediction location, for consistency with \code{\link[hetGP]{mleHomGP}}.
#' }
#' @importFrom MASS ginv
#' @details The full predictive variance corresponds to the sum of \code{sd2} and \code{nugs}. See \code{\link[hetGP]{mleHomGP}} for examples.
#' @method predict homGP
#' @export
predict.homGP <- function(object, x, xprime = NULL, ...){
  # Coerce vector inputs to one-row matrices, checking dimension compatibility
  if(is.null(dim(x))){
    x <- matrix(x, nrow = 1)
    if(ncol(x) != ncol(object$X0)) stop("x is not a matrix")
  }
  
  if(!is.null(xprime) && is.null(dim(xprime))){
    xprime <- matrix(xprime, nrow = 1)
    if(ncol(xprime) != ncol(object$X0)) stop("xprime is not a matrix")
  }
  
  # Rebuild the inverse covariance matrix if it was stripped from the object
  if(is.null(object$Ki))
    object$Ki <- chol2inv(chol(add_diag(cov_gen(X1 = object$X0, theta = object$theta, type = object$covtype), object$g/object$mult + object$eps)))
  
  # Scale Ki by the variance estimate (local copy; caller's object unchanged)
  object$Ki <- object$Ki/object$nu_hat
  
  # Cross covariances between prediction locations and unique designs
  kx <- object$nu_hat * cov_gen(X1 = x, X2 = object$X0, theta = object$theta, type = object$covtype)
  
  nugs <- rep(object$nu_hat * object$g, nrow(x))
  mean <- as.vector(object$beta0 + kx %*% (object$Ki %*% (object$Z0 - object$beta0)))
  
  if(object$trendtype == 'SK'){
    sd2 <- as.vector(object$nu_hat - fast_diag(kx, tcrossprod(object$Ki, kx)))
  }else{
    # Ordinary kriging: extra term accounting for estimation of the trend
    sd2 <- as.vector(object$nu_hat - fast_diag(kx, tcrossprod(object$Ki, kx)) + (1 - tcrossprod(rowSums(object$Ki), kx))^2/sum(object$Ki))
  }
  
  ## In case of numerical errors, some sd2 values may become negative
  if(any(sd2 < 0)){
    sd2 <- pmax(0, sd2)
    warning("Numerical errors caused some negative predictive variances to be thresholded to zero. Consider using ginv via rebuild.homGP")
  }
  
  if(!is.null(xprime)){
    kxprime <- object$nu_hat * cov_gen(X1 = object$X0, X2 = xprime, theta = object$theta, type = object$covtype)
    # Prior covariance between x and xprime, computed once (the original
    # duplicated this expression in each of the four branches below)
    kxxprime <- object$nu_hat * cov_gen(X1 = x, X2 = xprime, theta = object$theta, type = object$covtype)
    # Matrix product order chosen to minimize cost depending on sizes
    if(nrow(x) < nrow(xprime)){
      crossterm <- kx %*% object$Ki %*% kxprime
    }else{
      crossterm <- kx %*% (object$Ki %*% kxprime)
    }
    if(object$trendtype == 'SK'){
      cov <- kxxprime - crossterm
    }else{
      # Ordinary kriging correction for the estimated trend
      cov <- kxxprime - crossterm + crossprod(1 - tcrossprod(rowSums(object$Ki), kx), 1 - rowSums(object$Ki) %*% kxprime)/sum(object$Ki)
    }
  }else{
    cov = NULL
  }
  
  return(list(mean = mean, sd2 = sd2, nugs = nugs, cov = cov))
}

## Define the "rebuild" S4 generic only if it does not already exist
## (e.g., when another loaded package has already created it)
if(!isGeneric("rebuild")) {
  setGeneric(name = "rebuild",
             def = function(object, ...) standardGeneric("rebuild")
  )
}

#' Functions to make \code{hetGP} objects lighter before exporting them, and to reverse this after import.
#' The \code{rebuild} function may also be used to obtain more robust inverse of covariance matrices using \code{\link[MASS]{ginv}}.
#' @title Import and export of hetGP objects
#' @param object \code{homGP} or \code{homTP} model without slot \code{Ki} (inverse covariance matrix),
#'  or \code{hetGP} or \code{hetTP} model without slot \code{Ki} or \code{Kgi}
#' @param robust if \code{TRUE} \code{\link[MASS]{ginv}} is used for matrix inversion, otherwise it is done via Cholesky.
## ' @param ... not used
#' @export
#' @return \code{object} with additional or removed slots.
#' @rdname ExpImp
#' @examples 
#' set.seed(32)
#' ## motorcycle data
#' library(MASS)
#' X <- matrix(mcycle$times, ncol = 1)
#' Z <- mcycle$accel
#' ## Model fitting
#' model <- mleHetGP(X = X, Z = Z, lower = 0.1, upper = 50)
#' 
#' # Check size
#' object.size(model)
#' 
#' # Remove internal elements, e.g., to save it
#' model <- strip(model)
#' 
#' # Check new size
#' object.size(model)
#' 
#' # Now rebuild model, and use ginv instead
#' model <- rebuild(model, robust = TRUE)
#' object.size(model)
#' 
rebuild <- function (object, robust) {
  ## S3 dispatch on the class of object (homGP, hetGP, homTP, hetTP);
  ## dispatching on the first argument is UseMethod's default
  UseMethod("rebuild")
}

## ' Rebuild inverse covariance matrix of \code{homGP} (e.g., if exported without \code{Ki})
## ' @param object \code{homGP} model without slot \code{Ki} (inverse covariance matrix)
## ' @param robust if \code{TRUE} \code{\link[MASS]{ginv}} is used for matrix inversion, otherwise it is done via Cholesky.
#' @method rebuild homGP
#' @rdname ExpImp
#' @export
rebuild.homGP <- function(object, robust = FALSE){
  # Covariance matrix K = C + diag(g/mult + eps), built once instead of
  # duplicating the expression in both branches
  K <- add_diag(cov_gen(X1 = object$X0, theta = object$theta, type = object$covtype), object$g/object$mult + object$eps)
  if(robust){
    # Moore-Penrose pseudo-inverse: slower, but robust to near-singular K.
    # NOTE(review): only this branch scales by 1/nu_hat while the Cholesky
    # branch does not, yet predict.homGP rescales Ki by 1/nu_hat itself --
    # confirm the robust path is not scaled twice downstream.
    object$Ki <- ginv(K)/object$nu_hat
  }else{
    object$Ki <- chol2inv(chol(K))
  }
  
  return(object)
}

#' @export
#' @rdname ExpImp
strip <- function (object) {
  ## Drop the heavy internal slots (inverse covariance matrices and nested
  ## models) before export; rebuild() can restore what is needed later
  for(slot in c("Ki", "Kgi", "modHom", "modNugs")){
    if(!is.null(object[[slot]])) object[[slot]] <- NULL
  }
  return(object)
}


###############################################################################
## Part II: Heterogeneous GP with all options for the fit
###############################################################################

## ' log-likelihood in the anisotropic case - one lengthscale by variable
## ' Model: K = nu2 * (C + Lambda) = nu using all observations using the replicates information
## ' nu2 is replaced by its plugin estimator in the likelihood
## ' @param X0 unique designs
## ' @param Z0 averaged observations
## ' @param Z replicated observations (sorted with respect to X0)
## ' @param mult number of replicates at each Xi
## ' @param Delta vector of nuggets corresponding to each X0i or pXi, that are smoothed to give Lambda
## ' @param logN should exponentiated variance be used
## ' @param SiNK should the smoothing come from the SiNK predictor instead of the kriging one
## ' @param theta scale parameter for the mean process, either one value (isotropic) or a vector (anisotropic)
## ' @param k_theta_g constant used for linking nuggets lengthscale to mean process lengthscale, i.e., theta_g[k] = k_theta_g * theta[k], alternatively theta_g can be used
## ' @param theta_g either one value (isotropic) or a vector (anisotropic), alternative to using k_theta_g
## ' @param g nugget of the nugget process
## ' @param pX matrix of pseudo inputs locations of the noise process for Delta (could be replaced by a vector to avoid double loop)
## ' @param beta0 mean, if not provided, the MLE estimator is used
## ' @param eps minimal value of elements of Lambda
## ' @param covtype covariance kernel type
## ' @param penalty should a penalty term on Delta be used?
## ' @param hom_ll reference homoskedastic likelihood
## ' @export
## Heteroskedastic GP log-likelihood (possibly penalized), using the
## simplified expression under replication; the scale nu2 is replaced by its
## plugin estimator. Arguments are documented in the comment block above.
logLikHet <- function(X0, Z0, Z, mult, Delta, theta, g, k_theta_g = NULL, theta_g = NULL, logN = FALSE, SiNK = FALSE,
                      beta0 = NULL, pX = NULL, eps = sqrt(.Machine$double.eps), covtype = "Gaussian", SiNK_eps = 1e-4,
                      penalty = TRUE, hom_ll = NULL, env = NULL, trace = 0){
  n <- nrow(X0)
  N <- length(Z)
  
  # Lengthscales of the noise process, possibly tied to those of the mean process
  if(is.null(theta_g))
    theta_g <- k_theta_g * theta
  
  # Smooth the latent log-variances Delta into Lambda with a second GP,
  # either anchored at the unique designs X0 or at pseudo-inputs pX
  if(is.null(pX)){
    Cg <- cov_gen(X1 = X0, theta = theta_g, type = covtype)
    Kg_c <- chol(Cg + diag(eps + g/mult))
    Kgi <- chol2inv(Kg_c)
    
    nmean <- drop(rowSums(Kgi) %*% Delta / sum(Kgi)) ## ordinary kriging mean
    
    if(SiNK){
      # Rescaled (SiNK) smoothing of the latent variances
      rhox <- 1 / rho_AN(xx = X0, X0 = X0, theta_g = theta_g, g = g, type = covtype, eps = eps, SiNK_eps = SiNK_eps, mult = mult)
      M <-  rhox * Cg %*% (Kgi %*% (Delta - nmean))
    }else{
      M <- Cg %*% (Kgi %*% (Delta - nmean))
    }
    
  }else{
    Cg <- cov_gen(X1 = pX, theta = theta_g, type = covtype)
    Kg_c <- chol(Cg + diag(eps + g/mult))
    Kgi <- chol2inv(Kg_c)
    
    kg <- cov_gen(X1 = X0, X2 = pX, theta = theta_g, type = covtype)
    
    nmean <- drop(rowSums(Kgi) %*% Delta / sum(Kgi)) ## ordinary kriging mean
    
    if(SiNK){
      rhox <- 1 / rho_AN(xx = X0, X0 = pX, theta_g = theta_g, g = g, type = covtype, eps = eps, SiNK_eps = SiNK_eps, mult = mult)
      M <-  rhox * kg %*% (Kgi %*% (Delta - nmean))
    }else{
      M <- kg %*% (Kgi %*% (Delta - nmean))
    }
  }
  
  Lambda <- drop(nmean + M)
  
  if(logN){
    # Delta modeled on the log scale: map back to variances
    Lambda <- exp(Lambda)
  }else{
    # Guard against non-positive smoothed variances
    Lambda[Lambda <= 0] <- eps
  }
  
  # Expand per-design variances to all replicates
  LambdaN <- rep(Lambda, times = mult)
  
  # Temporarily store Cholesky transform of K in Ki
  C <- cov_gen(X1 = X0, theta = theta, type = covtype)
  if(!is.null(env)) env$C <- C
  Ki <- chol(C + diag(Lambda/mult + eps))
  ldetKi <- - 2 * sum(log(diag(Ki))) # log determinant from Cholesky
  Ki <- chol2inv(Ki)
  
  # Cache intermediate quantities for reuse by dlogLikHet
  if(!is.null(env)){
    env$Cg <- Cg
    env$Kg_c <- Kg_c
    env$Kgi <- Kgi
    env$ldetKi <- ldetKi
    env$Ki <- Ki
  }
  
  if(is.null(beta0))
    beta0 <- drop(colSums(Ki) %*% Z0 / sum(Ki))
  
  psi_0 <- drop(crossprod(Z0 - beta0, Ki) %*% (Z0 - beta0))
  
  # Plugin estimator of the scale, exploiting replication
  psi <- 1/N * (crossprod((Z - beta0)/LambdaN, Z - beta0) - crossprod((Z0 - beta0) * mult/Lambda, Z0 - beta0) + psi_0)
  
  loglik <- -N/2 * log(2*pi) - N/2 * log(psi) + 1/2 * ldetKi - 1/2 * sum((mult - 1) * log(Lambda) + log(mult)) - N/2
  
  if(penalty){
    # Penalty: likelihood of the latent noise GP evaluated at Delta
    nu_hat_var <- drop(crossprod(Delta - nmean, Kgi) %*% (Delta - nmean))/length(Delta)
    
    ## To avoid 0 variance, e.g., when Delta = nmean
    if(nu_hat_var < eps){
      return(loglik)
    } 
    
    pen <- - n/2 * log(nu_hat_var) - sum(log(diag(Kg_c))) - n/2*log(2*pi) - n/2
    
    # Do not let a positive penalty inflate a fit that is already worse than
    # the homoskedastic reference
    if(loglik < hom_ll && pen > 0){
      if(trace > 0) warning("Penalty is deactivated when unpenalized likelihood is lower than its homGP equivalent")
      return(loglik)
    }
    
    return(loglik + pen)
  }
  return(loglik)
}




## ' derivative of log-likelihood for logLikHet_Wood with respect to theta and Lambda with all observations
## ' Model: K = nu2 * (C + Lambda) = nu using all observations using the replicates information
## ' nu2 is replaced by its plugin estimator in the likelihood
## ' @param X0 unique designs
## ' @param Z0 averaged observations
## ' @param Z replicated observations (sorted with respect to X0)
## ' @param mult number of replicates at each Xi
## ' @param Delta vector of nuggets corresponding to each X0i or pXi, that are smoothed to give Lambda
## ' @param logN should exponentiated variance be used
## ' @param SiNK should the smoothing come from the SiNK predictor instead of the kriging one
## ' @param theta scale parameter for the mean process, either one value (isotropic) or a vector (anisotropic)
## ' @param k_theta_g constant used for linking nuggets lengthscale to mean process lengthscale, i.e., theta_g[k] = k_theta_g * theta[k], alternatively theta_g can be used
## ' @param theta_g either one value (isotropic) or a vector (anisotropic), alternative to using k_theta_g
## ' @param g nugget of the nugget process
## ' @param pX matrix of pseudo inputs locations of the noise process for Delta (could be replaced by a vector to avoid double loop)
## ' @param components to determine which variable are to be taken in the derivation:
## ' NULL for all, otherwise list with elements from 'theta', 'Delta', 'theta_g', 'k_theta_g', 'pX' and 'g'.
## ' @param beta0 mean, if not provided, the MLE estimator is used
## ' @param eps minimal value of elements of Lambda
## ' @param covtype covariance kernel type
## ' @param penalty should a penalty term on Delta be used?
## ' @param hom_ll reference homoskedastic likelihood
## ' @export
## Gradient of the (possibly penalized) heteroskedastic log-likelihood with
## respect to the hyperparameters selected in 'components'.
## Returns the concatenation of the non-NULL derivatives, in the order:
## theta, Delta, k_theta_g, theta_g, g, pX.
dlogLikHet <- function(X0, Z0, Z, mult, Delta, theta, g, k_theta_g = NULL, theta_g = NULL, beta0 = NULL, pX = NULL,
                       logN = TRUE, SiNK = FALSE, components = NULL, eps = sqrt(.Machine$double.eps), covtype = "Gaussian", SiNK_eps = 1e-4,
                       penalty = TRUE, hom_ll, env = NULL){
  
  ## Verifications
  # Without one of the two, theta_g would silently become numeric(0) below
  # (NULL * theta), leading to a cryptic downstream failure; fail fast instead.
  if(is.null(k_theta_g) && is.null(theta_g))
    stop("Either k_theta_g or theta_g must be provided \n")
  
  ## Initialisations
  
  # Which terms need to be computed
  if(is.null(components)){
    components <- c(list("theta", "Delta", "g"))
    if(is.null(k_theta_g)){
      components <- c(components, list("theta_g"))
    }else{
      components <- c(components, list("k_theta_g"))
    }
    if(!is.null(pX))
      components <- c(components, list("pX"))
  }
  
  # Noise process lengthscales tied to those of the mean process
  if(is.null(theta_g))
    theta_g <- k_theta_g * theta
  
  n <- nrow(X0)
  N <- length(Z)
  
  # Reuse precomputed matrices from the environment when available
  # NOTE(review): the env branch only provides M when pX is NULL; the pX path
  # appears to rely on env also carrying M/kg -- TODO confirm with callers.
  if(!is.null(env)){
    Cg <- env$Cg
    Kg_c <- env$Kg_c
    Kgi <- env$Kgi
    if(is.null(pX)) M <- add_diag(Kgi * (-eps - g / mult), rep(1, n))
  }else{
    if(is.null(pX)){
      Cg <- cov_gen(X1 = X0, theta = theta_g, type = covtype)
      Kg_c <- chol(Cg + diag(eps + g/mult))
      Kgi <- chol2inv(Kg_c)
      # M <- Cg %*% Kgi
      M <- add_diag(Kgi * (-eps - g / mult), rep(1, n))
    }else{
      Cg <- cov_gen(X1 = pX, theta = theta_g, type = covtype)
      Kg_c <- chol(Cg + diag(eps + g/mult))
      Kgi <- chol2inv(Kg_c)
      kg <- cov_gen(X1 = X0, X2 = pX, theta = theta_g, type = covtype)
    }
  }
  
  ## Precomputations for reuse
  
  rSKgi <- rowSums(Kgi)
  sKgi <- sum(Kgi)
  
  nmean <- drop(rSKgi %*% Delta / sKgi) ## ordinary kriging mean
  
  ## Precomputations for reuse
  KgiD <- Kgi %*% (Delta - nmean)
  
  if(SiNK){
    rhox <- 1 / rho_AN(xx = X0, X0 = pX, theta_g = theta_g, g = g, type = covtype, eps = eps, SiNK_eps = SiNK_eps, mult = mult)
    M <-  rhox * M
  }
  
  # Smoothed noise variances at X0 (on log scale if logN)
  Lambda <- drop(nmean + M %*% (Delta - nmean))
  if(logN){
    Lambda <- exp(Lambda)
  }
  else{
    Lambda[Lambda <= 0] <- eps
  }
  # Lambda <- pmax(mult * eps, Lambda)
  
  LambdaN <- rep(Lambda, times = mult)
  
  if(!is.null(env)){
    C <- env$C
    Ki <- env$Ki
    ldetKi <- env$ldetKi
  }else{
    C <- cov_gen(X1 = X0, theta = theta, type = covtype)
    Ki <- chol(C + diag(Lambda/mult + eps))
    ldetKi <- - 2 * sum(log(diag(Ki))) # log determinant from Cholesky
    Ki <- chol2inv(Ki)
  }
  
  # Ki <- chol2inv(chol(C + diag(Lambda/mult + eps)))
  
  # Ordinary kriging trend estimate unless beta0 is fixed
  if(is.null(beta0))
    beta0 <- drop(colSums(Ki) %*% Z0 / sum(Ki))
  
  ## Precomputations for reuse
  KiZ0 <- Ki %*% (Z0 - beta0)
  rsM <- rowSums(M)
  
  psi_0 <- drop(crossprod(KiZ0, Z0 - beta0))
  
  psi <- drop((crossprod((Z - beta0)/LambdaN, Z - beta0) - crossprod((Z0 - beta0) * mult/Lambda, Z0 - beta0)) + psi_0)
  
  if(penalty){
    nu_hat_var <- drop(crossprod(KgiD, (Delta - nmean)))/length(Delta) 
    
    # To prevent numerical issues when Delta = nmean, resulting in divisions by zero
    if(nu_hat_var < eps){
      penalty <- FALSE
    }else{
      loglik <- -N/2 * log(2*pi) - N/2 * log(psi/N)  + 1/2 * ldetKi - 1/2 * sum((mult - 1) * log(Lambda) + log(mult)) - N/2
      pen <- - n/2 * log(nu_hat_var) - sum(log(diag(Kg_c))) - n/2*log(2*pi) - n/2
      if(loglik < hom_ll && pen > 0) penalty <- FALSE
    }
    # if(nu_hat_var < eps || (hardpenalty && (- n/2 * log(nu_hat_var) - sum(log(diag(Kg_c))) - n/2*log(2*pi) - n/2) > 0)) penalty <- FALSE
  }       
  
  dLogL_dtheta <- dLogL_dDelta <- dLogL_dkthetag <- dLogL_dthetag <- dLogL_dg <- dLogL_dpX <- NULL
  
  # First component, derivative of logL with respect to theta
  if("theta" %in% components){
    dLogL_dtheta <- rep(NA, length(theta))
    for(i in 1:length(theta)){
      if(length(theta) == 1){
        dC_dthetak <- partial_cov_gen(X1 = X0, theta = theta, arg = "theta_k", type = covtype) * C # partial derivative of C with respect to theta
      }else{
        dC_dthetak <- partial_cov_gen(X1 = X0[, i, drop = FALSE], theta = theta[i], arg = "theta_k", type = covtype) * C # partial derivative of C with respect to theta
      }
      
      if("k_theta_g" %in% components){
        
        if(is.null(pX)){
          if(length(theta) == 1){
            dCg_dthetak <- partial_cov_gen(X1 = X0, theta = k_theta_g * theta, arg = "theta_k", type = covtype) * k_theta_g * Cg # partial derivative of Cg with respect to theta[i]
          }else{
            dCg_dthetak <- partial_cov_gen(X1 = X0[, i, drop = FALSE], theta = k_theta_g * theta[i], arg = "theta_k", type = covtype) * k_theta_g * Cg # partial derivative of Cg with respect to theta[i]
          }
          
          # Derivative Lambda / theta_k (first part)
          if(SiNK == FALSE)
            dLdtk <- dCg_dthetak %*% KgiD - M %*% (dCg_dthetak %*% KgiD)
          
          # Derivative Lambda / theta_k (first part)
          if(SiNK == TRUE){
            Kgitkg <- t(M) # for reuse
            d_irho_dtheta_k <- -1/2 * (fast_diag(M, t(kg)))^(-3/2) * (fast_diag(dCg_dthetak, Kgitkg) - fast_diag(M, dCg_dthetak %*% Kgitkg) + fast_diag(M, t(dCg_dthetak)))
            dLdtk <- (d_irho_dtheta_k * kg + rhox * (dCg_dthetak - (M) %*% dCg_dthetak)) %*% KgiD
          }
        }else{
          # BUGFIX: was length(theta == 1), whose length() is always >= 1,
          # so the isotropic branch was always taken even for vector theta.
          if(length(theta) == 1){
            dCg_dthetak <- partial_cov_gen(X1 = pX, theta = k_theta_g * theta, arg = "theta_k", type = covtype) * k_theta_g * Cg # partial derivative of Cg with respect to theta[i]
            dkg_dthetak <- partial_cov_gen(X1 = X0, X2 = pX, theta = k_theta_g * theta, arg = "theta_k", type = covtype) * k_theta_g * kg # partial derivative of kg with respect to theta[i]
          }else{
            dCg_dthetak <- partial_cov_gen(X1 = pX[, i, drop = FALSE], theta = k_theta_g * theta[i], arg = "theta_k", type = covtype) * k_theta_g * Cg # partial derivative of Cg with respect to theta[i]
            dkg_dthetak <- partial_cov_gen(X1 = X0[, i, drop = FALSE], X2 = pX[, i, drop = FALSE], theta = k_theta_g * theta[i], arg = "theta_k", type = covtype) * k_theta_g * kg # partial derivative of kg with respect to theta[i]
          }
          
          # Derivative Lambda / theta_k (first part)
          if(SiNK == FALSE)
            dLdtk <- dkg_dthetak %*% KgiD - M %*% (dCg_dthetak %*% KgiD)
          
          # Derivative Lambda / theta_k (first part)
          if(SiNK == TRUE){
            Kgitkg <- t(M) # for reuse
            d_irho_dtheta_k <- -1/2 * (fast_diag(M, t(kg)))^(-3/2) * (fast_diag(dkg_dthetak, Kgitkg) - fast_diag(M, dCg_dthetak %*% Kgitkg) + fast_diag(M, t(dkg_dthetak)))
            dLdtk <- (d_irho_dtheta_k * kg + rhox * (dkg_dthetak - (M) %*% dCg_dthetak)) %*% KgiD
          }
        }
        
        
        # (second part)
        dLdtk <- dLdtk - (1 - rsM) * drop(rSKgi %*% dCg_dthetak %*% (Kgi %*% Delta) * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dthetak %*% rSKgi))/sKgi^2
        
        if(logN)
          dLdtk <- dLdtk * Lambda
        
        dK_dthetak <- add_diag(dC_dthetak, drop(dLdtk)/mult) # dK/dtheta[k]
        dLogL_dtheta[i] <- N/2 * (crossprod((Z - beta0)/LambdaN * rep(dLdtk, times = mult), (Z - beta0)/LambdaN) - crossprod((Z0 - beta0)/Lambda * mult * dLdtk, (Z0 - beta0)/Lambda) +
                                    crossprod(KiZ0, dK_dthetak) %*% KiZ0)/psi - 1/2 * trace_sym(Ki, dK_dthetak)
        dLogL_dtheta[i] <- dLogL_dtheta[i] - 1/2 * sum((mult - 1) * dLdtk/Lambda) # derivative of the sum(a_i - 1)log(lambda_i)
        
        if(penalty)
          dLogL_dtheta[i] <- dLogL_dtheta[i]  + 1/2 * crossprod(KgiD, dCg_dthetak) %*% KgiD / nu_hat_var  - 1/2 * trace_sym(Kgi, dCg_dthetak)
        
      }else{
        dLogL_dtheta[i] <- N/2 * crossprod(KiZ0, dC_dthetak) %*% KiZ0/psi - 1/2 * trace_sym(Ki, dC_dthetak)
      }
    }
  }
  
  # Derivative of logL with respect to Lambda
  if(any(c("Delta", "g", "k_theta_g", "theta_g", "pX") %in% components)){
    dLogLdLambda <- N/2 * ((fast_tUY2(mult, (Z - beta0)^2) - (Z0 - beta0)^2 * mult)/Lambda^2 + KiZ0^2/mult)/psi - (mult - 1)/(2*Lambda) - 1/(2*mult) * diag(Ki)
    
    if(logN){
      dLogLdLambda <- Lambda * dLogLdLambda
    }
  }
  
  
  ## Derivative of Lambda with respect to Delta
  if("Delta" %in% components)
    # dLogL_dDelta <- crossprod(M - tcrossprod(M, matrix(rep(rSKgi / sKgi, ncol(Kgi)), ncol(Kgi))) + matrix(rep(rSKgi / sKgi, nrow(M)), nrow(M), byrow = T), dLogLdLambda) #chain rule
    dLogL_dDelta <- crossprod(M, dLogLdLambda) + rSKgi/sKgi*sum(dLogLdLambda) -  rSKgi / sKgi * sum(crossprod(M, dLogLdLambda)) #chain rule
  
  # Derivative Lambda / k_theta_g
  if("k_theta_g" %in% components){
    if(is.null(pX)){
      dCg_dk <- partial_cov_gen(X1 = X0, theta = theta, k_theta_g = k_theta_g, arg = "k_theta_g", type = covtype) * Cg
      
      if(SiNK){
        d_irho_dkthetag <- -1/2 * (fast_diag(M, t(kg)))^(-3/2) * (fast_diag(dCg_dk, Kgitkg) - fast_diag(M, dCg_dk %*% Kgitkg) + fast_diag(M, t(dCg_dk)))
        dLogL_dkthetag <- (d_irho_dkthetag * kg + rhox*(dCg_dk - M %*% dCg_dk)) %*% KgiD -
          (1 - rsM) * drop(rSKgi %*% dCg_dk %*% Kgi %*% Delta * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dk %*% rSKgi))/sKgi^2
      }else{
        dLogL_dkthetag <- dCg_dk %*% KgiD - M %*% (dCg_dk %*% KgiD) -
          (1 - rsM) * drop(rSKgi %*% dCg_dk %*% (Kgi %*% Delta) * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dk %*% rSKgi))/sKgi^2
      }
    }else{
      dCg_dk <- partial_cov_gen(X1 = pX, theta = theta, k_theta_g = k_theta_g, arg = "k_theta_g", type = covtype) * Cg
      dkg_dk <- partial_cov_gen(X1 = X0, X2 = pX, theta = theta, k_theta_g = k_theta_g, arg = "k_theta_g", type = covtype) * kg
      
      if(SiNK){
        d_irho_dkthetag <- -1/2 * (fast_diag(M, t(kg)))^(-3/2) * (fast_diag(dkg_dk, Kgitkg) - fast_diag(M, dCg_dk %*% Kgitkg) + fast_diag(M, t(dkg_dk)))
        dLogL_dkthetag <- (d_irho_dkthetag * kg + rhox*(dkg_dk - M %*% dCg_dk)) %*% KgiD -
          (1 - rsM) * drop(rSKgi %*% dCg_dk %*% Kgi %*% Delta * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dk %*% rSKgi))/sKgi^2
      }else{
        dLogL_dkthetag <- dkg_dk %*% KgiD - M %*% (dCg_dk %*% KgiD) -
          (1 - rsM) * drop(rSKgi %*% dCg_dk %*% (Kgi %*% Delta) * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dk %*% rSKgi))/sKgi^2
      }
    }
    
    dLogL_dkthetag <- crossprod(dLogL_dkthetag, dLogLdLambda) ## chain rule
  }
  
  # Derivative Lambda / theta_g
  if("theta_g" %in% components){
    dLogL_dthetag <- rep(NA, length(theta_g))
    
    for(i in 1:length(theta_g)){
      if(is.null(pX)){
        
        if(length(theta_g) == 1){
          dCg_dthetagk <- partial_cov_gen(X1 = X0, theta = theta_g, arg = "theta_k", type = covtype) * Cg # partial derivative of Cg with respect to theta
        }else{
          dCg_dthetagk <- partial_cov_gen(X1 = X0[, i, drop = FALSE], theta = theta_g[i], arg = "theta_k", type = covtype) * Cg # partial derivative of Cg with respect to theta
        }
        
        if(SiNK){
          d_irho_dtheta_gk <- -1/2 * (fast_diag(M, t(kg)))^(-3/2) * (fast_diag(dCg_dthetagk, Kgitkg) - fast_diag(M, dCg_dthetagk %*% Kgitkg) + fast_diag(M, t(dCg_dthetagk)))
          dLogL_dthetag[i] <- crossprod((d_irho_dtheta_gk * kg + rhox * (dCg_dthetagk - M %*% dCg_dthetagk)) %*% KgiD -
                                          (1 - rsM) * (rSKgi %*% dCg_dthetagk %*% Kgi %*% Delta * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dthetagk %*% rSKgi))/sKgi^2, dLogLdLambda) #chain rule
        }else{
          dLogL_dthetag[i] <- crossprod(dCg_dthetagk %*% KgiD - M %*% (dCg_dthetagk %*% KgiD) -
                                          (1 - rsM) * drop(rSKgi %*% dCg_dthetagk %*% (Kgi %*% Delta) * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dthetagk %*% rSKgi))/sKgi^2, dLogLdLambda) #chain rule
        }
      }else{
        if(length(theta_g) == 1){
          dCg_dthetagk <- partial_cov_gen(X1 = pX, theta = theta_g, arg = "theta_k", type = covtype) * Cg # partial derivative of Cg with respect to theta
          dkg_dthetagk <- partial_cov_gen(X1 = X0, X2 = pX, theta = theta_g, arg = "theta_k", type = covtype) * kg # partial derivative of Cg with respect to theta
        }else{
          dCg_dthetagk <- partial_cov_gen(X1 = pX[, i, drop = FALSE], theta = theta_g[i], arg = "theta_k", type = covtype) * Cg # partial derivative of Cg with respect to theta
          dkg_dthetagk <- partial_cov_gen(X1 = X0[, i, drop = FALSE], X2 = pX[, i, drop = FALSE], theta = theta_g[i], arg = "theta_k", type = covtype) * kg # partial derivative of Cg with respect to theta
        }
        
        if(SiNK){
          d_irho_dtheta_gk <- -1/2 * (fast_diag(M, t(kg)))^(-3/2) * (fast_diag(dkg_dthetagk, Kgitkg) - fast_diag(M, dCg_dthetagk %*% Kgitkg) + fast_diag(M, t(dkg_dthetagk)))
          dLogL_dthetag[i] <- crossprod((d_irho_dtheta_gk * kg + rhox * (dkg_dthetagk - M %*% dCg_dthetagk)) %*% KgiD -
                                          (1 - rsM) * (rSKgi %*% dCg_dthetagk %*% Kgi %*% Delta * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dthetagk %*% rSKgi))/sKgi^2, dLogLdLambda) #chain rule
        }else{
          dLogL_dthetag[i] <- crossprod(dkg_dthetagk %*% KgiD - M %*% (dCg_dthetagk %*% KgiD) -
                                          (1 - rsM) * drop(rSKgi %*% dCg_dthetagk %*% (Kgi %*% Delta) * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dthetagk %*% rSKgi))/sKgi^2, dLogLdLambda) #chain rule
        }
      }
      
      # Penalty term
      if(penalty) dLogL_dthetag[i] <- dLogL_dthetag[i] + 1/2 * crossprod(KgiD, dCg_dthetagk) %*% KgiD/nu_hat_var - trace_sym(Kgi, dCg_dthetagk)/2 
    }
  }
  
  
  ## Derivative Lambda / g
  if("g" %in% components){
    if(SiNK){
      A0 <- diag(1/mult, n)
      d_irho_dg <- -1/2 * (fast_diag(M, t(kg)))^(-3/2) * fast_diag(-M %*% A0, Kgitkg)
      dLogL_dg <- crossprod((d_irho_dg * M - rhox * M %*% A0 %*% Kgi) %*% (Delta - nmean) - (1 - rsM) * (rowSums(Kgi %*% A0 %*% Kgi) %*% Delta * sKgi - rSKgi %*% Delta * sum(rSKgi^2/mult))/sKgi^2, dLogLdLambda) #chain rule
    }else{
      dLogL_dg <- crossprod(-M %*% (KgiD/mult) - (1 - rsM) * drop(Delta %*% (Kgi %*% (rSKgi/mult)) * sKgi - rSKgi %*% Delta * sum(rSKgi^2/mult))/sKgi^2, dLogLdLambda) #chain rule
    }
  }
  
  
  # Derivative Lambda/pX
  if("pX" %in% components){
    dLogL_dpX <- rep(NA, length(pX))
    for(i in 1:nrow(pX)){
      for(j in 1:ncol(pX)){
        dCg_dpX <- partial_cov_gen(X1 = pX, theta = theta_g, i1 = i, i2 = j, arg = "X_i_j", type = covtype) * Cg
        dkg_dpX <- t(partial_cov_gen(X1 = pX, X2 = X0, theta = theta_g, i1 = i, i2 = j, arg = "X_i_j", type = covtype)) * kg
        
        if(SiNK){
          d_irho_dX_i_j <- -1/2 * (fast_diag(M, t(kg)))^(-3/2) * (fast_diag(dkg_dpX, Kgitkg) - fast_diag(M, dCg_dpX %*% Kgitkg) + fast_diag(M, t(dkg_dpX)))
          dLogL_dpX[(j-1)*nrow(pX) + i] <- crossprod((d_irho_dX_i_j * kg + rhox * (dkg_dpX - M %*% dCg_dpX)) %*% KgiD -
                                                       (1 - rsM) * (rSKgi %*% dCg_dpX %*% Kgi %*% Delta * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dpX %*% rSKgi))/sKgi^2, dLogLdLambda)
        }else{
          dLogL_dpX[(j-1)*nrow(pX) + i] <- crossprod((dkg_dpX - M %*% dCg_dpX) %*% KgiD -
                                                       (1 - rsM) * (rSKgi %*% dCg_dpX %*% Kgi %*% Delta * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dpX %*% rSKgi))/sKgi^2, dLogLdLambda)
        }
        
        # NOTE(review): the quadratic-form penalty term below is subtracted here,
        # whereas the analogous theta_g penalty adds it -- verify the sign for the
        # (experimental) pX path before relying on these derivatives.
        if(penalty) dLogL_dpX[(j-1)*nrow(pX) + i] <- dLogL_dpX[(j-1)*nrow(pX) + i] - 1/2 * crossprod(KgiD, dCg_dpX) %*% KgiD / nu_hat_var - trace_sym(Kgi, dCg_dpX)/2 
        
      }
    }
  }
  
  # Additional penalty terms on Delta
  if(penalty){
    if("Delta" %in% components){
      dLogL_dDelta <- dLogL_dDelta - KgiD / nu_hat_var
    }
    if("k_theta_g" %in% components){
      dLogL_dkthetag <- dLogL_dkthetag + 1/2 * crossprod(KgiD, dCg_dk) %*% KgiD / nu_hat_var - trace_sym(Kgi, dCg_dk)/2 
    }
    if("g" %in% components){
      dLogL_dg <- dLogL_dg + 1/2 * crossprod(KgiD/mult, KgiD) / nu_hat_var - sum(diag(Kgi)/mult)/2 
    }
    
  }
  
  # Concatenated gradient; NULL entries (unrequested components) drop out
  return(c(dLogL_dtheta,
           dLogL_dDelta,
           dLogL_dkthetag,
           dLogL_dthetag,
           dLogL_dg,
           dLogL_dpX))
  
}


#' Compare two models based on the log-likelihood for \code{hetGP} and \code{homGP} models
#' @title Likelihood-based comparison of models
#' @param model1,model2 \code{hetGP} or \code{homGP} models
#' @return Best model based on the likelihood, first one in case of a tie
#' @note If comparing homoskedastic and heteroskedastic models, the un-penalised likelihood is used for the latter, see e.g., (Binois et al. 2017+).
#' @export
#' @references
#' M. Binois, Robert B. Gramacy, M. Ludkovski (2018), Practical heteroskedastic Gaussian process modeling for large simulation experiments,
#' Journal of Computational and Graphical Statistics, 27(4), 808--821.\cr 
#' Preprint available on arXiv:1611.05902.
compareGP <- function(model1, model2){
  # Likelihood used for comparison: heteroskedastic models are judged on
  # their un-penalized likelihood, homoskedastic ones on their likelihood.
  model_ll <- function(model){
    if(is(model, "hetGP")) return(model$ll_non_pen)
    model$ll
  }
  # Return the model with the larger likelihood; ties favor the first one
  if(model_ll(model2) > model_ll(model1)) return(model2)
  model1
}


#' @title Gaussian process modeling with heteroskedastic noise
#' @description 
#' Gaussian process regression under input dependent noise based on maximum likelihood estimation of the hyperparameters. 
#' A second GP is used to model latent (log-) variances. 
#' This function is enhanced to deal with replicated observations.
#' @param X matrix of all designs, one per row, or list with elements:
#' \itemize{
#'   \item \code{X0} matrix of unique design locations, one point per row
#'   \item \code{Z0} vector of averaged observations, of length \code{nrow(X0)}
#'   \item \code{mult} number of replicates at designs in \code{X0}, of length \code{nrow(X0)}
#' } 
#' @param Z vector of all observations. If using a list with \code{X}, \code{Z} has to be ordered with respect to \code{X0}, and of length \code{sum(mult)}
#' @param lower,upper optional bounds for the \code{theta} parameter (see \code{\link[hetGP]{cov_gen}} for the exact parameterization).
#' In the multivariate case, it is possible to give vectors for bounds (resp. scalars) for anisotropy (resp. isotropy)
#' @param noiseControl list with elements related to optimization of the noise process parameters:
#' \itemize{
#' \item \code{g_min}, \code{g_max} minimal and maximal noise to signal ratio (of the mean process)
#' \item \code{lowerDelta}, \code{upperDelta} optional vectors (or scalars) of bounds on \code{Delta}, of length \code{nrow(X0)} (default to \code{rep(eps, nrow(X0))} and \code{rep(noiseControl$g_max, nrow(X0))} resp., or their \code{log}) 
## ' \item lowerpX, upperpX optional vectors of bounds of the input domain if pX is used.
#' \item \code{lowerTheta_g}, \code{upperTheta_g} optional vectors of bounds for the lengthscales of the noise process if \code{linkThetas == 'none'}.
#' Same as for \code{theta} if not provided.
#' \item \code{k_theta_g_bounds} if \code{linkThetas == 'joint'}, vector with minimal and maximal values for \code{k_theta_g} (default to \code{c(1, 100)}). See Details.
#' \item \code{g_bounds} vector for minimal and maximal noise to signal ratios for the noise of the noise process, i.e., the smoothing parameter for the noise process.
#' (default to \code{c(1e-6, 1)}).
#'}
#' @param settings list for options about the general modeling procedure, with elements:
#' \itemize{
#'   \item \code{linkThetas} defines the relation between lengthscales of the mean and noise processes.
#'   Either \code{'none'}, \code{'joint'}(default) or \code{'constr'}, see Details.
#'   \item \code{logN}, when \code{TRUE} (default), the log-noise process is modeled.
#'   \item \code{initStrategy} one of \code{'simple'}, \code{'residuals'} (default) and \code{'smoothed'} to obtain starting values for \code{Delta}, see Details
#'   \item \code{penalty} when \code{TRUE}, the penalized version of the likelihood is used (i.e., the sum of the log-likelihoods of the mean and variance processes, see References).
## '   \item \code{hardpenalty} is \code{TRUE}, the log-likelihood from the noise GP is taken into account only if negative (default if \code{maxit > 1000}).
#'   \item \code{checkHom} when \code{TRUE}, if the log-likelihood with a homoskedastic model is better, then return it.
#'   \item \code{trace} optional scalar (default to \code{0}). If positive, tracing information on the fitting process.
#' If \code{1}, information is given about the result of the heterogeneous model optimization.
#' Level \code{2} gives more details. Level \code{3} additionally displays all details about initialization of hyperparameters.
#' \item \code{return.matrices} boolean to include the inverse covariance matrix in the object for further use (e.g., prediction).
#' \item \code{return.hom} boolean to include homoskedastic GP models used for initialization (i.e., \code{modHom} and \code{modNugs}).
#' \item \code{factr} (default to 1e9) and \code{pgtol} are available to be passed to \code{control} for L-BFGS-B in \code{\link[stats]{optim}}.   
#' }
#' @param eps jitter used in the inversion of the covariance matrix for numerical stability
#' @param init,known optional lists of starting values for mle optimization or that should not be optimized over, respectively.
#' Values in \code{known} are not modified, while it can happen to these of \code{init}, see Details. 
#' One can set one or several of the following:
#' \itemize{
#' \item \code{theta} lengthscale parameter(s) for the mean process either one value (isotropic) or a vector (anisotropic)
#' \item \code{Delta} vector of nuggets corresponding to each design in \code{X0}, that are smoothed to give \code{Lambda}
#' (as the global covariance matrix depend on \code{Delta} and \code{nu_hat}, it is recommended to also pass values for \code{theta})
#' \item \code{beta0} constant trend of the mean process
#' \item \code{k_theta_g} constant used to link mean and noise process lengthscales, when \code{settings$linkThetas == 'joint'}
#' \item \code{theta_g} either one value (isotropic) or a vector (anisotropic) for lengthscale parameter(s) of the noise process, when \code{settings$linkThetas != 'joint'}
#' \item \code{g} scalar nugget of the noise process
#' \item \code{g_H} scalar homoskedastic nugget for the initialisation with a \code{\link[hetGP]{mleHomGP}}. See Details.
## '\item pX matrix of fixed pseudo inputs locations of the noise process corresponding to Delta
#' }
#' @param covtype covariance kernel type, either \code{'Gaussian'}, \code{'Matern5_2'} or \code{'Matern3_2'}, see \code{\link[hetGP]{cov_gen}}
#' @param maxit maximum number of iterations for \code{L-BFGS-B} of \code{\link[stats]{optim}} dedicated to maximum likelihood optimization
#' 
#' @details
#' 
#' The global covariance matrix of the model is parameterized as \code{nu_hat * (C + Lambda * diag(1/mult)) = nu_hat * K},
#' with \code{C} the correlation matrix between unique designs, depending on the family of kernel used (see \code{\link[hetGP]{cov_gen}} for available choices) and values of lengthscale parameters.
#' \code{nu_hat} is the plugin estimator of the variance of the process.
#' \code{Lambda} is the prediction on the noise level given by a second (homoskedastic) GP: \cr
#' \deqn{\Lambda = C_g(C_g + \mathrm{diag}(g/\mathrm{mult}))^{-1} \Delta} \cr
#' with \code{C_g} the correlation matrix between unique designs for this second GP, with lengthscales hyperparameters \code{theta_g} and nugget \code{g}
#' and \code{Delta} the variance level at \code{X0} that are estimated.
#' 
#' It is generally recommended to use \code{\link[hetGP]{find_reps}} to pre-process the data, to rescale the inputs to the unit cube and to normalize the outputs.
#' 
#' The noise process lengthscales can be set in several ways:
#' \itemize{
#' \item using \code{k_theta_g} (\code{settings$linkThetas == 'joint'}), supposed to be greater than one by default. 
#' In this case lengthscales of the noise process are multiples of those of the mean process.
#' \item if \code{settings$linkThetas == 'constr'}, then the lower bound on \code{theta_g} correspond to estimated values of an homoskedastic GP fit.
#' \item else lengthscales between the mean and noise process are independent (both either anisotropic or not).
#' }
#'
#' When no starting nor fixed parameter values are provided with \code{init} or \code{known}, 
#' the initialization process consists of fitting first an homoskedastic model of the data, called \code{modHom}.
#' Unless provided with \code{init$theta}, initial lengthscales are taken at 10\% of the range determined with \code{lower} and \code{upper},
#' while \code{init$g_H} may be used to pass an initial nugget value.
#' The resulting lengthscales provide initial values for \code{theta} (or update them if given in \code{init}). \cr \cr
#' If necessary, a second homoskedastic model, \code{modNugs}, is fitted to the empirical residual variance between the prediction
#'  given by \code{modHom} at \code{X0} and \code{Z} (up to \code{modHom$nu_hat}).
#' Note that when specifying \code{settings$linkThetas == 'joint'}, then this second homoskedastic model has fixed lengthscale parameters.
#' Starting values for \code{theta_g} and \code{g} are extracted from \code{modNugs}.\cr \cr
#' Finally, three initialization schemes for \code{Delta} are available with \code{settings$initStrategy}: 
#' \itemize{
#' \item for \code{settings$initStrategy == 'simple'}, \code{Delta} is simply initialized to the estimated \code{g} value of \code{modHom}. 
#' Note that this procedure may fail when \code{settings$penalty == TRUE}.
#' \item for \code{settings$initStrategy == 'residuals'}, \code{Delta} is initialized to the estimated residual variance from the homoskedastic mean prediction.
#' \item for \code{settings$initStrategy == 'smoothed'}, \code{Delta} takes the values predicted by \code{modNugs} at \code{X0}.
#' }
#'
#' Notice that \code{lower} and \code{upper} bounds cannot be equal for \code{\link[stats]{optim}}.
## ' To use pseudo-input locations for the noise process, one can either provide pX if they are not to be optimized.
## ' Otherwise, initial values are given with pXinit, and optimization bounds with lowerpX, upperpX in init.
## ' Automatic initialization of the other parameters without restriction is available for now only with method 'simple',
## ' otherwise it is assumed that pXinit points are a subset of X0.
#'
#' @return a list which is given the S3 class \code{"hetGP"}, with elements:
#' \itemize{
#' \item \code{theta}: unless given, maximum likelihood estimate (mle) of the lengthscale parameter(s),
#' \item \code{Delta}: unless given, mle of the nugget vector (non-smoothed),
#' \item \code{Lambda}: predicted input noise variance at \code{X0}, 
#' \item \code{nu_hat}: plugin estimator of the variance,
#' \item \code{theta_g}: unless given, mle of the lengthscale(s) of the noise/log-noise process,
#' \item \code{k_theta_g}: if \code{settings$linkThetas == 'joint'}, mle for the constant by which lengthscale parameters of \code{theta} are multiplied to get \code{theta_g},
#' \item \code{g}: unless given, mle of the nugget of the noise/log-noise process,
#' \item \code{trendtype}: either "\code{SK}" if \code{beta0} is provided, else "\code{OK}",
#' \item \code{beta0} constant trend of the mean process, plugin-estimator unless given,
#' \item \code{nmean}: plugin estimator for the constant noise/log-noise process mean,
## ' \item \code{pX}: if used, matrix of pseudo-inputs locations for the noise/log-noise process,
#' \item \code{ll}: log-likelihood value, (\code{ll_non_pen}) is the value without the penalty,
#' \item \code{nit_opt}, \code{msg}: \code{counts} and \code{message} returned by \code{\link[stats]{optim}}
#' \item \code{modHom}: homoskedastic GP model of class \code{homGP} used for initialization of the mean process,
#' \item \code{modNugs}: homoskedastic GP model of class \code{homGP} used for initialization of the noise/log-noise process,
#' \item \code{nu_hat_var}: variance of the noise process,
#' \item \code{used_args}: list with arguments provided in the call to the function, which is saved in \code{call},
#' \item \code{Ki}, \code{Kgi}: inverse of the covariance matrices of the mean and noise processes (not scaled by \code{nu_hat} and \code{nu_hat_var}),  
#' \item \code{X0}, \code{Z0}, \code{Z}, \code{eps}, \code{logN}, \code{covtype}: values given in input,
#' \item \code{time}: time to train the model, in seconds.
#'}
#' @seealso \code{\link[hetGP]{predict.hetGP}} for predictions, \code{\link[hetGP]{update.hetGP}} for updating an existing model.
#' \code{summary} and \code{plot} functions are available as well.
#' \code{\link[hetGP]{mleHetTP}} provide a Student-t equivalent.
#' @references 
#' M. Binois, Robert B. Gramacy, M. Ludkovski (2018), Practical heteroskedastic Gaussian process modeling for large simulation experiments,
#' Journal of Computational and Graphical Statistics, 27(4), 808--821.\cr 
#' Preprint available on arXiv:1611.05902. \cr \cr
#' @export
#' @importFrom stats optim var
#' @import methods
#' @examples 
#' ##------------------------------------------------------------
#' ## Example 1: Heteroskedastic GP modeling on the motorcycle data
#' ##------------------------------------------------------------
#' set.seed(32)
#' 
#' ## motorcycle data
#' library(MASS)
#' X <- matrix(mcycle$times, ncol = 1)
#' Z <- mcycle$accel
#' nvar <- 1
#' plot(X, Z, ylim = c(-160, 90), ylab = 'acceleration', xlab = "time")
#' 
#' 
#' ## Model fitting
#' model <- mleHetGP(X = X, Z = Z, lower = rep(0.1, nvar), upper = rep(50, nvar),
#'                   covtype = "Matern5_2")
#'             
#' ## Display averaged observations
#' points(model$X0, model$Z0, pch = 20)
#' 
#' ## A quick view of the fit                  
#' summary(model)
#' 
#' ## Create a prediction grid and obtain predictions
#' xgrid <- matrix(seq(0, 60, length.out = 301), ncol = 1) 
#' predictions <- predict(x = xgrid, object =  model)
#' 
#' ## Display mean predictive surface
#' lines(xgrid, predictions$mean, col = 'red', lwd = 2)
#' ## Display 95% confidence intervals
#' lines(xgrid, qnorm(0.05, predictions$mean, sqrt(predictions$sd2)), col = 2, lty = 2)
#' lines(xgrid, qnorm(0.95, predictions$mean, sqrt(predictions$sd2)), col = 2, lty = 2)
#' ## Display 95% prediction intervals
#' lines(xgrid, qnorm(0.05, predictions$mean, sqrt(predictions$sd2 + predictions$nugs)), 
#'   col = 3, lty = 2)
#' lines(xgrid, qnorm(0.95, predictions$mean, sqrt(predictions$sd2 + predictions$nugs)), 
#'   col = 3, lty = 2)
#' 
#' ##------------------------------------------------------------
#' ## Example 2: 2D Heteroskedastic GP modeling
#' ##------------------------------------------------------------
#' set.seed(1)
#' nvar <- 2
#'   
#' ## Branin redefined in [0,1]^2
#' branin <- function(x){
#'   if(is.null(nrow(x)))
#'     x <- matrix(x, nrow = 1)
#'     x1 <- x[,1] * 15 - 5
#'     x2 <- x[,2] * 15
#'     (x2 - 5/(4 * pi^2) * (x1^2) + 5/pi * x1 - 6)^2 + 10 * (1 - 1/(8 * pi)) * cos(x1) + 10
#' }
#' 
#' ## Noise field via standard deviation
#' noiseFun <- function(x){
#'   if(is.null(nrow(x)))
#'     x <- matrix(x, nrow = 1)
#'   return(1/5*(3*(2 + 2*sin(x[,1]*pi)*cos(x[,2]*3*pi) + 5*rowSums(x^2))))
#' }
#' 
#' ## data generating function combining mean and noise fields
#' ftest <- function(x){
#'   return(branin(x) + rnorm(nrow(x), mean = 0, sd = noiseFun(x)))
#' }
#' 
#' ## Grid of predictive locations
#' ngrid <- 51
#' xgrid <- matrix(seq(0, 1, length.out = ngrid), ncol = 1) 
#' Xgrid <- as.matrix(expand.grid(xgrid, xgrid))
#' 
#' ## Unique (randomly chosen) design locations
#' n <- 50
#' Xu <- matrix(runif(n * 2), n)
#' 
#' ## Select replication sites randomly
#' X <- Xu[sample(1:n, 20*n, replace = TRUE),]
#' 
#' ## obtain training data response at design locations X
#' Z <- ftest(X)
#' 
#' ## Formating of data for model creation (find replicated observations) 
#' prdata <- find_reps(X, Z, rescale = FALSE, normalize = FALSE)
#'
#' ## Model fitting
#' model <- mleHetGP(X = list(X0 = prdata$X0, Z0 = prdata$Z0, mult = prdata$mult), Z = prdata$Z,
#'                   lower = rep(0.01, nvar), upper = rep(10, nvar),
#'                   covtype = "Matern5_2")
#'
#' ## a quick view into the data stored in the "hetGP"-class object
#' summary(model)                  
#'              
#' ## prediction from the fit on the grid     
#' predictions <- predict(x = Xgrid, object =  model)
#' 
#' ## Visualization of the predictive surface
#' par(mfrow = c(2, 2))
#' contour(x = xgrid,  y = xgrid, z = matrix(branin(Xgrid), ngrid), 
#'   main = "Branin function", nlevels = 20)
#' points(X, col = 'blue', pch = 20)
#' contour(x = xgrid,  y = xgrid, z = matrix(predictions$mean, ngrid), 
#'   main = "Predicted mean", nlevels = 20)
#' points(Xu, col = 'blue', pch = 20)
#' contour(x = xgrid,  y = xgrid, z = matrix(noiseFun(Xgrid), ngrid), 
#'   main = "Noise standard deviation function", nlevels = 20)
#' points(Xu, col = 'blue', pch = 20)
#' contour(x = xgrid,  y= xgrid, z = matrix(sqrt(predictions$nugs), ngrid), 
#'   main = "Predicted noise values", nlevels = 20)
#' points(Xu, col = 'blue', pch = 20)
#' par(mfrow = c(1, 1))
##
mleHetGP <- function(X, Z, lower = NULL, upper = NULL,
                     noiseControl = list(k_theta_g_bounds = c(1, 100), g_max = 1e2, g_bounds = c(1e-6, 1)),
                     settings = list(linkThetas = 'joint', logN = TRUE, initStrategy = 'residuals', checkHom = TRUE,
                                     penalty = TRUE, trace = 0, return.matrices = TRUE, return.hom = FALSE, factr = 1e9), 
                     covtype = c("Gaussian", "Matern5_2", "Matern3_2"), maxit = 100, known = NULL, init = NULL, eps = sqrt(.Machine$double.eps)){
  
  ## Accept either a precomputed list (X0, Z0, mult), e.g., from find_reps,
  ## or a raw design matrix X whose replicates are detected here.
  if(typeof(X)=="list"){
    X0 <- X$X0
    Z0 <- X$Z0
    mult <- X$mult
    if(sum(mult) != length(Z)) stop("Length(Z) should be equal to sum(mult)")
    if(is.null(dim(X0))) X0 <- matrix(X0, ncol = 1)
    if(length(Z0) != nrow(X0)) stop("Dimension mismatch between Z0 and X0")
  }else{
    if(is.null(dim(X))) X <- matrix(X, ncol = 1)
    if(nrow(X) != length(Z)) stop("Dimension mismatch between Z and X")
    elem <- find_reps(X, Z, return.Zlist = F)
    X0 <- elem$X0
    Z0 <- elem$Z0
    Z <- elem$Z
    mult <- elem$mult
  }
  
  covtype <- match.arg(covtype)
  
  ## Derive default lengthscale bounds from the design when not provided
  if(is.null(lower) || is.null(upper)){
    auto_thetas <- auto_bounds(X = X0, covtype = covtype)
    if(is.null(lower)) lower <- auto_thetas$lower
    if(is.null(upper)) upper <- auto_thetas$upper
  }
  
  if(length(lower) != length(upper)) stop("upper and lower should have the same size")
  
  ## Save time to train model
  tic <- proc.time()[3]
  
  ## Initial checks
  
  n <- nrow(X0)
  
  if(is.null(n))
    stop("X0 should be a matrix. \n")
  
  ## linkThetas options: 'joint' ties theta_g = k_theta_g * theta,
  ## 'constr' lower-bounds theta_g by the homoskedastic theta estimate
  jointThetas <- constrThetas <- FALSE
  if(!is.null(known$theta_g)) settings$linkThetas <- FALSE
  
  if(is.null(settings$linkThetas)){
    jointThetas <- TRUE
  }else{
    if(settings$linkThetas == 'joint')
      jointThetas <- TRUE
    if(settings$linkThetas == 'constr')
      constrThetas <- TRUE
  }
  
  logN <- TRUE
  if(!is.null(settings$logN)){
    logN <- settings$logN
  }
  
  if(is.null(settings$return.matrices))
    settings$return.matrices <- TRUE
  
  if(is.null(settings$return.hom))
    settings$return.hom <- FALSE
  
  if(jointThetas && is.null(noiseControl$k_theta_g_bounds))
    noiseControl$k_theta_g_bounds <- c(1, 100)
  
  if(is.null(settings$initStrategy))
    settings$initStrategy <- 'residuals'
  
  if(is.null(settings$factr))
    settings$factr <- 1e9
  
  penalty <- TRUE
  if(!is.null(settings$penalty))
    penalty <- settings$penalty
  
  # if(!is.null(settings$hardpenalty)){
  #   hardpenalty <- settings$hardpenalty
  # }else{
  #   if(maxit < 1000) hardpenalty <- FALSE else hardpenalty <- TRUE
  # }
  
  
  if(is.null(settings$checkHom))
    settings$checkHom <- TRUE
  
  trace <- 0
  if(!is.null(settings$trace))
    trace <- settings$trace
  
  ## Build the list of hyperparameters to be optimized; known values are fixed
  ## ([["theta"]] is used to avoid partial matching with theta_g)
  components <- NULL
  if(is.null(known[["theta"]])){
    components <- c(components, list("theta"))
  }else{
    init$theta <- known[["theta"]]
  }
  
  if(is.null(known$Delta)){
    components <- c(components, list("Delta"))
  }else{
    init$Delta <- known$Delta
  }
  
  if(jointThetas){
    if(is.null(known$k_theta_g)){
      components <- c(components, list("k_theta_g"))
    }else{
      init$k_theta_g <- known$k_theta_g
    }
  }
  
  if(!jointThetas && is.null(known$theta_g)){
    components <- c(components, list("theta_g"))
  }else{
    if(!jointThetas) init$theta_g <- known$theta_g
  }
  
  if(is.null(known$g)){
    components <- c(components, list("g"))
  }else{
    init$g <- known$g
  }
  
  if(!is.null(init$pX)){
    components <- c(components, list("pX"))
    idcs_pX <- which(duplicated(rbind(init$pX, X0))) - nrow(init$pX) ## Indices of starting points in pX
  }else{
    if(!is.null(known$pX)){
      init$pX <- known$pX
    }else{
      init$pX <- NULL
    }
  }
  
  if(penalty && "pX" %in% components){
    penalty <- FALSE
    warning("Penalty not available with pseudo-inputs for now")
  }
  
  ## Ordinary kriging trend unless beta0 is user-supplied (simple kriging)
  trendtype <- 'OK'
  
  if(!is.null(known$beta0)){
    trendtype <- 'SK'
    # beta0 <- known$beta0
  }
  
  if(is.null(noiseControl$g_bounds)){
    noiseControl$g_bounds <- c(1e-6, 1)
  }
  
  if(is.null(components) && is.null(known$theta_g)){
    known$theta_g <- known$k_theta_g * known$theta
  }
  
  ## Advanced option Single Nugget Kriging model for the noise process
  if(!is.null(noiseControl$SiNK) && noiseControl$SiNK){
    SiNK <- TRUE
    if(is.null(noiseControl$SiNK_eps)){
      SiNK_eps <- 1e-4
    }else{
      SiNK_eps <- noiseControl$SiNK_eps
    }
  }else{
    SiNK <- FALSE
  } 
  
  
  ### Automatic Initialisation
  modHom <- modNugs <- NULL
  if(is.null(init[["theta"]]) || is.null(init$Delta)){
    ## A) homoskedastic mean process
    
    if(!is.null(known$g_H)){
      g_init <- NULL
    }else{
      g_init <- init$g_H
      ## Initial value for g of the homoskedastic process: based on the mean variance at replicates compared to the variance of Z0
      if(any(mult > 5)){
        mean_var_replicates <- mean((fast_tUY2(mult, (Z - rep(Z0, times = mult))^2)/mult)[which(mult > 5)])
        
        if(is.null(g_init)) g_init <- mean_var_replicates/var(Z0) 
        if(is.null(noiseControl$g_max))
          noiseControl$g_max <- max(1e2, 100 * g_init)
        if(is.null(noiseControl$g_min))
          noiseControl$g_min <- eps
        
      }else{
        if(is.null(g_init)) g_init <- 0.1
        
        if(is.null(noiseControl$g_max))
          noiseControl$g_max <- 1e2
        
        if(is.null(noiseControl$g_min))
          noiseControl$g_min <- eps
        
      }
      
    }
    
    # if(is.null(init[["theta"]]))
    #   init$theta <- sqrt(upper * lower)
    
    ## Ki of the homoskedastic model is only needed if it may be returned (checkHom)
    if(settings$checkHom){
      rKI <- TRUE #return.Ki
    }else{
      rKI <- FALSE
    } 
    modHom <- mleHomGP(X = list(X0 = X0, Z0 = Z0, mult = mult), Z = Z, lower = lower,
                       known = list(theta = known[["theta"]], g = known$g_H, beta0 = known$beta0),
                       upper = upper, init = list(theta = init$theta, g = g_init), covtype = covtype, maxit = maxit,
                       noiseControl = list(g_bounds = c(noiseControl$g_min, noiseControl$g_max)), eps = eps,
                       settings = list(return.Ki = rKI))
    
    if(is.null(known[["theta"]]))
      init$theta <- modHom$theta
    
    if(is.null(init$Delta)){
      predHom <- suppressWarnings(predict(x = X0, object = modHom)$mean)
      nugs_est <- (Z - rep(predHom, times = mult))^2 #squared deviation from the homoskedastic prediction mean to the actual observations
      nugs_est <-  nugs_est / modHom$nu_hat  # to be homogeneous with Delta
      
      if(logN){
        nugs_est <- pmax(nugs_est, .Machine$double.eps) # to avoid problems on deterministic test functions
        nugs_est <- log(nugs_est)
      }
      
      nugs_est0 <- drop(fast_tUY2(mult, nugs_est))/mult # average
      
    }else{
      nugs_est0 <- init$Delta
    }
    
    if(constrThetas){
      noiseControl$lowerTheta_g <- modHom$theta
    }
    
    if(settings$initStrategy == 'simple'){
      if(logN){
        init$Delta <- rep(log(modHom$g), nrow(X0))
      }else{
        init$Delta <- rep(modHom$g, nrow(X0))
      }
    }
    if(settings$initStrategy == 'residuals')
      init$Delta <- nugs_est0
    
  }
  
  
  if((is.null(init$theta_g) && is.null(init$k_theta_g)) || is.null(init$g)){
    
    ## B) Homogeneous noise process
    if(jointThetas){
      if(is.null(init$k_theta_g)){
        init$k_theta_g <- 1
        init$theta_g <- init$theta
      }else{
        init$theta_g <- init$k_theta_g * init$theta
      }
      
      ## With joint thetas, pin theta_g (+/- eps) during the auxiliary noise fit
      if(is.null(noiseControl$lowerTheta_g)){
        noiseControl$lowerTheta_g <- init$theta_g - eps
      }
      
      if(is.null(noiseControl$upperTheta_g)){
        noiseControl$upperTheta_g <- init$theta_g + eps
      }
    }
    
    if(!jointThetas && is.null(init$theta_g)){
      init$theta_g <- init$theta
    }
    
    if(is.null(noiseControl$lowerTheta_g)){
      noiseControl$lowerTheta_g <- lower
    }
    
    if(is.null(noiseControl$upperTheta_g)){
      noiseControl$upperTheta_g <- upper
    }
    
    ## If a homogeneous process of the mean has already been computed, it is used for estimating the parameters of the noise process
    if(exists("nugs_est")){
      
      if(is.null(init$g)){
        mean_var_replicates_nugs <- mean((fast_tUY2(mult, (nugs_est - rep(nugs_est0, times = mult))^2)/mult))
        init$g <- mean_var_replicates_nugs / var(nugs_est0)
      }
      
      modNugs <- mleHomGP(X = list(X0 = X0, Z0 = nugs_est0, mult = mult), Z = nugs_est,
                          lower = noiseControl$lowerTheta_g, upper = noiseControl$upperTheta_g,
                          init = list(theta = init$theta_g, g =  init$g), covtype = covtype, noiseControl = noiseControl,
                          maxit = maxit, eps = eps, settings = list(return.Ki = F))
      prednugs <- suppressWarnings(predict(x = X0, object = modNugs))
      
    }else{
      if(!exists("nugs_est0")) nugs_est0 <- init$Delta
      
      if(is.null(init$g)){
        init$g <- 0.05
      }
      
      modNugs <- mleHomGP(X = list(X0 = X0, Z0 = nugs_est0, mult = rep(1, nrow(X0))), Z = nugs_est0,
                          lower = noiseControl$lowerTheta_g, upper = noiseControl$upperTheta_g,
                          init = list(theta = init$theta_g, g =  init$g), covtype = covtype, noiseControl = noiseControl,
                          maxit = maxit, eps = eps, settings = list(return.Ki = F))
      prednugs <- suppressWarnings(predict(x = X0, object = modNugs))
    }
    
    if(settings$initStrategy == 'smoothed')
      init$Delta <- prednugs$mean
    
    if(is.null(known$g))
      init$g <- modNugs$g
    
    if(jointThetas && is.null(init$k_theta_g))
      init$k_theta_g <- 1
    
    if(!jointThetas && is.null(init$theta_g))
      init$theta_g <- modNugs$theta
    
  }
  
  if(is.null(noiseControl$lowerTheta_g)){
    noiseControl$lowerTheta_g <- lower
  }
  
  if(is.null(noiseControl$upperTheta_g)){
    noiseControl$upperTheta_g <- upper
  }
  
  ### Start of optimization of the log-likelihood
  ## Objective: unpack the flat parameter vector par (order: theta, Delta,
  ## k_theta_g or theta_g, g, pX) and evaluate the heteroskedastic likelihood
  fn <- function(par, X0, Z0, Z, mult, Delta = NULL, theta = NULL, g = NULL, k_theta_g = NULL, theta_g = NULL, logN = FALSE, SiNK = FALSE,
                 beta0 = NULL, pX = NULL, hom_ll, env){
    
    idx <- 1 # to store the first non used element of par
    
    if(is.null(theta)){
      idx <- length(init$theta)
      theta <- par[1:idx]
      idx <- idx + 1
    }
    if(is.null(Delta)){
      Delta <- par[idx:(idx - 1 + length(init$Delta))]
      idx <- idx + length(init$Delta)
    }
    if(jointThetas && is.null(k_theta_g)){
      k_theta_g <- par[idx]
      idx <- idx + 1
    }
    if(!jointThetas && is.null(theta_g)){
      theta_g <- par[idx:(idx - 1 + length(init$theta_g))]
      idx <- idx + length(init$theta_g)
    }
    if(is.null(g)){
      g <- par[idx]
      idx <- idx + 1
    }
    if(idx != (length(par) + 1))
      pX <- matrix(par[idx:length(par)], ncol = ncol(X0))
    
    loglik <- logLikHet(X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = Delta, theta = theta, g = g, k_theta_g = k_theta_g, theta_g = theta_g,
                        logN = logN, SiNK = SiNK, beta0 = beta0, pX = pX, covtype = covtype, eps = eps, SiNK_eps = SiNK_eps, penalty = penalty,
                        hom_ll = hom_ll, env = env, trace = trace)
    
    ## Track the best parameters seen so far, to recover from optim failures
    if(!is.null(env) && !is.na(loglik)){
      if(is.null(env$max_loglik) || loglik > env$max_loglik){
        env$max_loglik <- loglik
        env$arg_max <- par
      }
    } 
    
    return(loglik)
  }
  
  
  ## Gradient counterpart of fn, with the same parameter unpacking
  gr <- function(par, X0, Z0, Z, mult, Delta = NULL, theta = NULL, g = NULL, k_theta_g = NULL, theta_g = NULL, logN = FALSE, SiNK = FALSE,
                 beta0 = NULL, pX = NULL, hom_ll, env){
    
    idx <- 1 # to store the first non used element of par
    
    if(is.null(theta)){
      theta <- par[1:length(init$theta)]
      idx <- idx + length(init$theta)
    }
    if(is.null(Delta)){
      Delta <- par[idx:(idx - 1 + length(init$Delta))]
      idx <- idx + length(init$Delta)
    }
    if(jointThetas && is.null(k_theta_g)){
      k_theta_g <- par[idx]
      idx <- idx + 1
    }
    if(!jointThetas && is.null(theta_g)){
      theta_g <- par[idx:(idx - 1 + length(init$theta_g))]
      idx <- idx + length(init$theta_g)
    }
    if(is.null(g)){
      g <- par[idx]
      idx <- idx + 1
    }
    if(idx != (length(par) + 1))
      pX <- matrix(par[idx:length(par)], ncol = ncol(X0))
    
    # c(grad(logLikHet, X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = Delta, x = theta, g = g, k_theta_g = k_theta_g, theta_g = theta_g,
    #        logN = logN, SiNK = SiNK, beta0 = beta0, pX = pX),
    #   grad(logLikHet, X0 = X0, Z0 = Z0, Z = Z, mult = mult, x = Delta, theta = theta, g = g, k_theta_g = k_theta_g, theta_g = theta_g,
    #        logN = logN, SiNK = SiNK, beta0 = beta0, pX = pX),
    #   grad(logLikHet, X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = Delta, theta = theta, g = g, x = k_theta_g, theta_g = theta_g,
    #        logN = logN, SiNK = SiNK, beta0 = beta0, pX = pX),
    #   grad(logLikHet, X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = Delta, theta = theta, x = g, k_theta_g = k_theta_g, theta_g = theta_g,
    #        logN = logN, SiNK = SiNK, beta0 = beta0, pX = pX)
    # )
    # grad(fn, x = par, X0 = X0, Z0 = Z0, Z = Z, mult = mult, logN = logN, SiNK = SiNK, method.args=list(r = 6))
    
    return(dlogLikHet(X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = Delta, theta = theta, g = g, k_theta_g = k_theta_g, theta_g = theta_g,
                      logN = logN, SiNK = SiNK, beta0 = beta0, pX = pX, components = components, covtype = covtype, eps = eps, SiNK_eps = SiNK_eps,
                      penalty = penalty, hom_ll = hom_ll, env = env))
    
  }
  
  ## Pre-processing for heterogeneous fit: assemble starting point and box
  ## bounds following the same ordering as in fn/gr
  parinit <- lowerOpt <- upperOpt <- NULL
  
  if(trace > 2)
    cat("Initial value of the parameters:\n")
  
  if(is.null(known[["theta"]])){
    parinit <- init$theta
    lowerOpt <- lower
    upperOpt <- upper
    if(trace > 2) cat("Theta: ", init$theta, "\n")
  }
  
  if(is.null(known$Delta)){
    
    if(is.null(noiseControl$lowerDelta) || length(noiseControl$lowerDelta) != n){
      if(logN){
        noiseControl$lowerDelta <- log(eps)
      }else{
        noiseControl$lowerDelta <- eps
      }
    }
    
    if(length(noiseControl$lowerDelta) == 1){
      noiseControl$lowerDelta <- rep(noiseControl$lowerDelta, n)
    }
    
    if(is.null(noiseControl$g_max)) noiseControl$g_max <- 1e2 
    
    if(is.null(noiseControl$upperDelta) || length(noiseControl$upperDelta) != n){
      if(logN){
        noiseControl$upperDelta <- log(noiseControl$g_max)
      }else{
        noiseControl$upperDelta <- noiseControl$g_max
      }
    }
    
    if(length(noiseControl$upperDelta) == 1){
      noiseControl$upperDelta <- rep(noiseControl$upperDelta, n)
    }
    
    ## For now, only the values at pX are kept
    ## It could be possible to fit a pseudo-input GP to all of Delta
    if("pX" %in% components){
      init$Delta <- init$Delta[idcs_pX]
      noiseControl$lowerDelta <- noiseControl$lowerDelta[idcs_pX]
      noiseControl$upperDelta <- noiseControl$upperDelta[idcs_pX]
    }
    
    lowerOpt <- c(lowerOpt, noiseControl$lowerDelta)
    upperOpt <- c(upperOpt, noiseControl$upperDelta)
    parinit <- c(parinit, init$Delta)
    
    if(trace > 2) cat("Delta: ", init$Delta, "\n")
  }
  
  if(jointThetas && is.null(known$k_theta_g)){
    parinit <- c(parinit, init$k_theta_g)
    lowerOpt <- c(lowerOpt, noiseControl$k_theta_g_bounds[1])
    upperOpt <- c(upperOpt, noiseControl$k_theta_g_bounds[2])
    if(trace > 2) cat("k_theta_g: ", init$k_theta_g, "\n")
  }
  
  if(!jointThetas && is.null(known$theta_g)){
    parinit <- c(parinit, init$theta_g)
    lowerOpt <- c(lowerOpt, noiseControl$lowerTheta_g)
    upperOpt <- c(upperOpt, noiseControl$upperTheta_g)
    if(trace > 2) cat("theta_g: ", init$theta_g, "\n")
  }
  
  if(is.null(known$g)){
    parinit <- c(parinit, init$g)
    if(trace > 2) cat("g: ", init$g, "\n")
    
    lowerOpt <- c(lowerOpt, noiseControl$g_bounds[1])
    upperOpt <- c(upperOpt, noiseControl$g_bounds[2])
  }
  
  if("pX" %in% components){
    parinit <- c(parinit, as.vector(init$pX))
    lowerOpt <- c(lowerOpt, as.vector(rep(noiseControl$lowerpX, times = nrow(init$pX))))
    upperOpt <- c(upperOpt, as.vector(rep(noiseControl$upperpX, times = nrow(init$pX))))
    if(trace > 2) cat("pX: ", as.vector(init$pX), "\n")
  }
  
  
  ## Case when some parameters need to be estimated
  mle_par <- known # Store inferred and known parameters
  if(!is.null(components)){
    
    if(!is.null(modHom)){
      hom_ll <- modHom$ll
    }else{
      ## Compute reference homoskedastic likelihood, with fixed theta for speed
      ## (note: no stray positional argument here; all arguments are named to
      ## avoid silently filling an unintended formal of mleHomGP)
      modHom_tmp <- mleHomGP(X = list(X0 = X0, Z0 = Z0, mult = mult), Z = Z, lower = lower, upper = upper,
                             known = list(theta = known[["theta"]], g = known$g_H, beta0 = known$beta0), covtype = covtype, init = init,
                             noiseControl = list(g_bounds = c(noiseControl$g_min, noiseControl$g_max)), eps = eps,
                             settings = list(return.Ki = F))
      
      hom_ll <- modHom_tmp$ll
    } 
    
    ## Maximization of the log-likelihood
    envtmp <- environment()
    out <- try(optim(par = parinit, fn = fn, gr = gr, method = "L-BFGS-B", lower = lowerOpt, upper = upperOpt, X0 = X0, Z0 = Z0, Z = Z,
                     mult = mult, logN = logN, SiNK = SiNK,
                     Delta = known$Delta, theta = known[["theta"]], g = known$g, k_theta_g = known$k_theta_g, theta_g = known$theta_g,
                     pX = known$pX, beta0 = known$beta0, hom_ll = hom_ll, env = envtmp,
                     control = list(fnscale = -1, maxit = maxit, factr = settings$factr, pgtol = settings$pgtol)))
    
    ## Catch errors when at least one likelihood evaluation worked
    if(is(out, "try-error"))
      out <- list(par = envtmp$arg_max, value = envtmp$max_loglik, counts = NA,
                  message = "Optimization stopped due to NAs, use best value so far")
    
    ## Temporary
    if(trace > 0){
      print(out$message)
      cat("Number of variables at boundary values:" , length(which(out$par == upperOpt)) + length(which(out$par == lowerOpt)), "\n")
    }
    
    if(trace > 1)
      cat("Name | Value | Lower bound | Upper bound \n")
    
    ## Post-processing: unpack out$par in the same order as fn/gr
    idx <- 1
    if(is.null(known[["theta"]])){
      mle_par$theta <- out$par[1:length(init$theta)]
      idx <- idx + length(init$theta)
      if(trace > 1) cat("Theta |", mle_par$theta, " | ", lower, " | ", upper, "\n")
    }
    if(is.null(known$Delta)){
      mle_par$Delta <- out$par[idx:(idx - 1 + length(init$Delta))]
      idx <- idx + length(init$Delta)
      if(trace > 1){
        for(ii in 1:ceiling(length(mle_par$Delta)/5)){
          i_tmp <- (1 + 5*(ii-1)):min(5*ii, length(mle_par$Delta))
          if(logN) cat("Delta |", mle_par$Delta[i_tmp], " | ", pmax(log(eps * mult[i_tmp]), init$Delta[i_tmp] - log(1000)), " | ", init$Delta[i_tmp] + log(100), "\n")
          if(!logN) cat("Delta |", mle_par$Delta[i_tmp], " | ", pmax(mult[i_tmp] * eps, init$Delta[i_tmp] / 1000), " | ", init$Delta[i_tmp] * 100, "\n")
        }
      }
    }
    if(jointThetas){
      if(is.null(known$k_theta_g)){
        mle_par$k_theta_g <- out$par[idx]
        idx <- idx + 1
      } 
      mle_par$theta_g <- mle_par$k_theta_g * mle_par$theta
      
      if(trace > 1) cat("k_theta_g |", mle_par$k_theta_g, " | ", noiseControl$k_theta_g_bounds[1], " | ", noiseControl$k_theta_g_bounds[2], "\n")
    }
    if(!jointThetas && is.null(known$theta_g)){
      mle_par$theta_g <- out$par[idx:(idx - 1 + length(init$theta_g))]
      idx <- idx + length(init$theta_g)
      if(trace > 1) cat("theta_g |", mle_par$theta_g, " | ", noiseControl$lowerTheta_g, " | ", noiseControl$upperTheta_g, "\n")
    }
    if(is.null(known$g)){
      mle_par$g <- out$par[idx]
      idx <- idx + 1
      if(trace > 1) cat("g |", mle_par$g, " | ", noiseControl$g_bounds[1], " | ", noiseControl$g_bounds[2], "\n")
    }
    if(idx != (length(out$par) + 1)){
      mle_par$pX <- matrix(out$par[idx:length(out$par)], ncol = ncol(X0))
      if(trace > 1) cat("pX |", as.vector(mle_par$pX), " | ", as.vector(rep(noiseControl$lowerpX, times = nrow(init$pX))), " | ", as.vector(rep(noiseControl$upperpX, times = nrow(init$pX))), "\n")
    }
    
  }else{
    out <- list(message = "All hyperparameters given, no optimization \n", count = 0, value = NULL)
  }
  
  ## Computation of nu2
  
  ## Unpenalized likelihood, for fair comparison against the homoskedastic fit
  if(penalty){
    ll_non_pen <- logLikHet(X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = mle_par$Delta, theta = mle_par$theta, g = mle_par$g, k_theta_g = mle_par$k_theta_g, theta_g = mle_par$theta_g,
                            logN = logN, SiNK = SiNK, beta0 = mle_par$beta0, pX = mle_par$pX, covtype = covtype, eps = eps, SiNK_eps = SiNK_eps, penalty = FALSE, hom_ll = NULL, trace = trace)
  }else{
    ll_non_pen <- out$value
  }
  if(!is.null(modHom)){
    if(modHom$ll >= ll_non_pen){
      if(trace >= 0) cat("Homoskedastic model has higher log-likelihood: \n", modHom$ll, " compared to ", ll_non_pen, "\n")
      if(settings$checkHom){
        if(trace >= 0) cat("Return homoskedastic model \n")
        return(modHom)
      }
    }
  }
  
  ## Smooth the latent Delta values through the noise GP to obtain Lambda
  if(is.null(mle_par$pX)){
    Cg <- cov_gen(X1 = X0, theta = mle_par$theta_g, type = covtype)
    Kgi <- chol2inv(chol(Cg + diag(eps + mle_par$g/mult)))
    
    nmean <- drop(rowSums(Kgi) %*% mle_par$Delta / sum(Kgi)) ## ordinary kriging mean
    
    nu_hat_var <- max(eps, drop(crossprod(mle_par$Delta - nmean, Kgi) %*% (mle_par$Delta - nmean))/length(mle_par$Delta))
    
    if(SiNK){
      rhox <- 1 / rho_AN(xx = X0, X0 = mle_par$pX, theta_g = mle_par$theta_g, g = mle_par$g, type = covtype, eps = eps, SiNK_eps = SiNK_eps, mult = mult)
      M <-  rhox * Cg %*% (Kgi %*% (mle_par$Delta - nmean))
    }else{
      M <- Cg %*% (Kgi %*% (mle_par$Delta - nmean))
    }
  }else{
    Kgi <- chol2inv(chol(add_diag(cov_gen(X1 = mle_par$pX, theta = mle_par$theta_g, type = covtype), eps + mle_par$g/mult)))
    
    kg <- cov_gen(X1 = X0, X2 = mle_par$pX, theta = mle_par$theta_g, type = covtype)
    
    nmean <- drop(rowSums(Kgi) %*% mle_par$Delta / sum(Kgi)) ## ordinary kriging mean
    
    nu_hat_var <- max(eps, drop(crossprod(mle_par$Delta - nmean, Kgi) %*% (mle_par$Delta - nmean))/length(mle_par$Delta))
    
    if(SiNK){
      rhox <- 1 / rho_AN(xx = X0, X0 = mle_par$pX, theta_g = mle_par$theta_g, g = mle_par$g, type = covtype, eps = eps, SiNK_eps = SiNK_eps, mult = mult)
      M <-  rhox * kg %*% (Kgi %*% (mle_par$Delta - nmean))
    }else{
      M <- kg %*% (Kgi %*% (mle_par$Delta - nmean))
    }
  }
  
  
  Lambda <- drop(nmean + M)
  
  if(logN){
    Lambda <- exp(Lambda)
  }
  else{
    Lambda[Lambda <= 0] <- eps
  }
  
  LambdaN <- rep(Lambda, times = mult)
  
  Ki <- chol2inv(chol(add_diag(cov_gen(X1 = X0, theta = mle_par$theta, type = covtype), Lambda/mult + eps))) 
  
  if(is.null(known$beta0))
    mle_par$beta0 <- drop(colSums(Ki) %*% Z0 / sum(Ki))
  
  psi_0 <- drop(crossprod(Z0 - mle_par$beta0, Ki) %*% (Z0 - mle_par$beta0))
  
  ## Plug-in scale estimate nu2, using the replicate-aware decomposition
  nu2 <- 1/length(Z) * (crossprod((Z - mle_par$beta0)/LambdaN, Z - mle_par$beta0) - crossprod((Z0 - mle_par$beta0) * mult/Lambda, Z0 - mle_par$beta0) + psi_0)
  
  res <- list(theta = mle_par$theta, Delta = mle_par$Delta, nu_hat = as.numeric(nu2), beta0 = mle_par$beta0,
              k_theta_g = mle_par$k_theta_g, theta_g = mle_par$theta_g, g = mle_par$g, nmean = nmean, Lambda = Lambda,
              ll = out$value, ll_non_pen = ll_non_pen, nit_opt = out$counts, logN = logN, SiNK = SiNK, covtype = covtype, pX = mle_par$pX, msg = out$message,
              X0 = X0, Z0 = Z0, Z = Z, mult = mult, trendtype = trendtype, eps = eps,
              nu_hat_var = nu_hat_var, call = match.call(),
              used_args = list(noiseControl = noiseControl, settings = settings, lower = lower, upper = upper, known = known),
              time = proc.time()[3] - tic)
  if(SiNK){
    res <- c(res, list(SiNK_eps = SiNK_eps))
  }
  
  if(settings$return.matrices){
    res <- c(res, list(Ki = Ki, Kgi = Kgi))
  }
  
  if(settings$return.hom){
    res <- c(res, list(modHom = modHom, modNugs = modNugs))
  }
  
  class(res) <- "hetGP"
  
  return(res)
  
}

## Register an S4 generic for predict() only if none exists yet (methods
## package), so that predict() can also dispatch on S4 objects without
## clobbering a generic defined by another loaded package.
if(!isGeneric("predict")) {
  setGeneric(name = "predict",
             def = function(object, ...) standardGeneric("predict")
  )
}

#'Gaussian process predictions using a heterogeneous noise GP object (of class \code{hetGP}) 
#' @param x matrix of designs locations to predict at (one point per row)
#' @param object an object of class \code{hetGP}; e.g., as returned by \code{\link[hetGP]{mleHetGP}}
#' @param noise.var should the variance of the latent variance process be returned?
#' @param xprime optional second matrix of predictive locations to obtain the predictive covariance matrix between \code{x} and \code{xprime}
#' @param nugs.only if \code{TRUE}, only return noise variance prediction
#' @param ... no other argument for this method.
#' @return list with elements
#' \itemize{
#' \item \code{mean}: kriging mean;
#' \item \code{sd2}: kriging variance (filtered, e.g. without the nugget values)
#' \item \code{nugs}: noise variance prediction
#' \item \code{sd2var}: (returned if \code{noise.var = TRUE}) kriging variance of the noise process (i.e., on log-variances if \code{logN = TRUE})
#' \item \code{cov}: (returned if \code{xprime} is given) predictive covariance matrix between \code{x} and \code{xprime}
#' }
#' @details The full predictive variance corresponds to the sum of \code{sd2} and \code{nugs}.
#' See \code{\link[hetGP]{mleHetGP}} for examples.
#' @method predict hetGP 
#' @export
predict.hetGP <- function(object, x, noise.var = FALSE, xprime = NULL, nugs.only = FALSE, ...){
  
  ## Coerce a bare vector to a one-row matrix; the ncol check then catches
  ## inputs whose length does not match the training dimension
  if(is.null(dim(x))){
    x <- matrix(x, nrow = 1)
    if(ncol(x) != ncol(object$X0)) stop("x is not a matrix")
  }
  
  if(!is.null(xprime) && is.null(dim(xprime))){
    xprime <- matrix(xprime, nrow = 1)
    if(ncol(xprime) != ncol(object$X0)) stop("xprime is not a matrix")
  }
  
  ## Rebuild the inverse noise-process covariance if it was stripped
  ## (e.g., model exported with return.matrices = FALSE)
  if(is.null(object$Kgi)){
    if(is.null(object$pX)){
      Cg <- cov_gen(X1 = object$X0, theta = object$theta_g, type = object$covtype)
    }else{
      ## pseudo-inputs variant: noise GP anchored at pX instead of X0
      Cg <- cov_gen(X1 = object$pX, theta = object$theta_g, type = object$covtype)
    }
    
    object$Kgi <- chol2inv(chol(add_diag(Cg, object$eps + object$g/object$mult)))
  }
  
  ## Cross-covariances between prediction sites and the noise GP anchors
  if(is.null(object$pX)){
    kg <- cov_gen(X1 = x, X2 = object$X0, theta = object$theta_g, type = object$covtype)
  }else{
    kg <- cov_gen(X1 = x, X2 = object$pX, theta = object$theta_g, type = object$covtype)
  }
  
  ## Rebuild the inverse mean-process covariance if missing
  if(is.null(object$Ki))
    object$Ki <- chol2inv(chol(add_diag(cov_gen(X1 = object$X0, theta = object$theta, type = object$covtype), object$Lambda/object$mult + object$eps)))
  
  ## Noise GP prediction: kriging correction M around the constant trend nmean
  ## (with optional SiNK rescaling)
  if(object$SiNK){
    M <-  1/rho_AN(xx = x, X0 = object$pX, theta_g = object$theta_g, g = object$g,
                   type = object$covtype, SiNK_eps = object$SiNK_eps, eps = object$eps, mult = object$mult) * kg %*% (object$Kgi %*% (object$Delta - object$nmean))
  }else{
    M <- kg %*% (object$Kgi %*% (object$Delta - object$nmean))
  }
  
  ## Predicted noise variances; exponentiate when the noise GP was fit on
  ## log-variances, otherwise threshold negative values at zero
  if(object$logN){
    nugs <- object$nu_hat * exp(drop(object$nmean + M))
  }else{
    nugs <- object$nu_hat * pmax(0, drop(object$nmean + M))
  }
  
  if(nugs.only){
    return(list(nugs = nugs))
  }
  
  ## Optional kriging variance of the (latent) noise process itself
  if(noise.var){
    if(is.null(object$nu_hat_var))
      object$nu_hat_var <- max(object$eps, drop(crossprod(object$Delta - object$nmean, object$Kgi) %*% (object$Delta - object$nmean))/length(object$Delta)) ## To avoid 0 variance
    sd2var <- object$nu_hat * object$nu_hat_var* drop(1 - fast_diag(kg, tcrossprod(object$Kgi, kg)) + (1 - tcrossprod(rowSums(object$Kgi), kg))^2/sum(object$Kgi))
  }else{
    sd2var <- NULL
  }
  
  kx <- cov_gen(X1 = x, X2 = object$X0, theta = object$theta, type = object$covtype)
  
  ## Mean-process kriging variance; the OK branch adds the extra term
  ## accounting for the estimated constant trend
  if(object$trendtype == 'SK'){
    sd2 <- object$nu_hat * drop(1 - fast_diag(kx, tcrossprod(object$Ki, kx)))
  }else{
    sd2 <- object$nu_hat * drop(1 - fast_diag(kx, tcrossprod(object$Ki, kx)) + (1 - tcrossprod(rowSums(object$Ki), kx))^2/sum(object$Ki))
  }
  
  ## In case of numerical errors, some sd2 values may become negative
  if(any(sd2 < 0)){
    # object$Ki <- ginv(add_diag(cov_gen(X1 = object$X0, theta = object$theta, type = object$covtype), object$Lambda/object$mult + object$eps))
    # 
    # if(object$trendtype == 'SK'){
    #   sd2 <- object$nu_hat * drop(1 - fast_diag(kx, tcrossprod(object$Ki, kx)))
    # }else{
    #   sd2 <- object$nu_hat * drop(1 - fast_diag(kx, tcrossprod(object$Ki, kx)) + (1 - tcrossprod(rowSums(object$Ki), kx))^2/sum(object$Ki))
    # }
    sd2 <- pmax(0, sd2)
    warning("Numerical errors caused some negative predictive variances to be thresholded to zero. Consider using ginv via rebuild.hetGP")
  }
  
  ## Optional predictive covariance between x and xprime; the two branches
  ## differ only in matrix-product grouping, chosen by which side is smaller
  if(!is.null(xprime)){
    kxprime <- cov_gen(X1 = object$X0, X2 = xprime, theta = object$theta, type = object$covtype)
    if(object$trendtype == 'SK'){
      if(nrow(x) < nrow(xprime)){
        cov <- object$nu_hat * (cov_gen(X1 = x, X2 = xprime, theta = object$theta, type = object$covtype) - kx %*% object$Ki %*% kxprime) 
      }else{
        cov <- object$nu_hat * (cov_gen(X1 = x, X2 = xprime, theta = object$theta, type = object$covtype) - kx %*% (object$Ki %*% kxprime))
      }
    }else{
      if(nrow(x) < nrow(xprime)){
        cov <- object$nu_hat * (cov_gen(X1 = x, X2 = xprime, theta = object$theta, type = object$covtype) - kx %*% object$Ki %*% kxprime + crossprod(1 - tcrossprod(rowSums(object$Ki), kx), 1 - rowSums(object$Ki) %*% kxprime)/sum(object$Ki))
      }else{
        cov <- object$nu_hat * (cov_gen(X1 = x, X2 = xprime, theta = object$theta, type = object$covtype) - kx %*% (object$Ki %*%  kxprime) + crossprod(1 - tcrossprod(rowSums(object$Ki), kx), 1 - rowSums(object$Ki) %*% kxprime)/sum(object$Ki))
      }
    }
  }else{
    cov = NULL
  }
  
  return(list(mean = drop(object$beta0 + kx %*% (object$Ki %*% (object$Z0 - object$beta0))),
              sd2 = sd2,
              nugs = nugs,
              sd2var = sd2var,
              cov = cov))
}


#' @method summary hetGP
#' @export
summary.hetGP <- function(object,...){
  ## Return the model itself, re-classed so that print.summary.hetGP
  ## is dispatched when the result is displayed
  structure(object, class = "summary.hetGP")
}

#' @export
print.summary.hetGP <- function(x, ...){
  ## Human-readable summary of a fitted hetGP model:
  ## data sizes, kernel hyperparameters, trend, noise process and MLE info.
  cat("N = ", length(x$Z), " n = ", length(x$Z0), " d = ", ncol(x$X0), "\n")
  cat(x$covtype, " covariance lengthscale values of the main process: ", x$theta, "\n")
  cat("Variance/scale hyperparameter: ", x$nu_hat, "\n")
  
  cat("Summary of Lambda values: \n")
  print(summary(x$Lambda))
  
  if(x$trendtype == "SK"){
    cat("Given constant trend value: ", x$beta0, "\n")
  }else{
    cat("Estimated constant trend value: ", x$beta0, "\n")
  }
  
  ## The noise GP is fitted on log-variances when logN is TRUE; label accordingly
  ## (fix: the non-log branch previously mislabeled the lengthscales as "log-noise")
  if(x$logN){
    cat(x$covtype, " covariance lengthscale values of the log-noise process: ", x$theta_g, "\n")
    cat("Nugget of the log-noise process: ", x$g, "\n")
    cat("Estimated constant trend value of the log-noise process: ", x$nmean, "\n")
  }else{
    cat(x$covtype, " covariance lengthscale values of the noise process: ", x$theta_g, "\n")
    cat("Nugget of the noise process: ", x$g, "\n")
    cat("Estimated constant trend value of the noise process: ", x$nmean, "\n")
  }
  
  cat("MLE optimization: \n", "Log-likelihood = ", x$ll, "; Nb of evaluations (obj, gradient) by L-BFGS-B: ", x$nit_opt, "; message: ", x$msg, "\n")
  
}

#' @method print hetGP
#' @export
print.hetGP <- function(x, ...){
  ## Printing a hetGP model shows its summary
  s <- summary(x)
  print(s)
}

#' @method plot hetGP
#' @export
plot.hetGP <- function(x, ...){
  ## Diagnostic plot: leave-one-out predictions (via LOO_preds) against the
  ## observed responses; a good fit hugs the 45-degree line.
  LOOpreds <- LOO_preds(x)
  ## One point per observation: LOO means at unique designs are repeated
  ## according to the replicate counts in x$mult
  plot(x$Z, LOOpreds$mean[rep(1:nrow(x$X0), times = x$mult)], xlab = "Observed values", ylab = "Predicted values",
       main = "Leave-one-out predictions")
  ## 90% LOO prediction intervals drawn with zero-length arrow heads
  ## NOTE(review): the interval endpoints are placed on the x-axis around the
  ## LOO mean (y held at the mean) — confirm this orientation is intended
  arrows(x0 = LOOpreds$mean + sqrt(LOOpreds$sd2) * qnorm(0.05),
         x1 = LOOpreds$mean + sqrt(LOOpreds$sd2) * qnorm(0.95),
         y0 = LOOpreds$mean, length = 0, col = "blue")
  ## Highlight replicated designs: averaged response vs LOO mean, in red
  points(x$Z0[which(x$mult > 1)], LOOpreds$mean[which(x$mult > 1)], pch = 20, col = 2)
  abline(a = 0, b = 1, lty = 3) # perfect-fit reference line
  legend("topleft", pch = c(1, 20, NA), lty = c(NA, NA, 1), col = c(1, 2, 4),
         legend = c("observations", "averages (if > 1 observation)", "LOO prediction interval"))
}

## ' Rebuild inverse covariance matrices of \code{hetGP} (e.g., if exported without inverse matrices \code{Kgi} and/or \code{Ki})
## ' @param object \code{hetGP} model without slots \code{Ki} and/or \code{Kgi} (inverse covariance matrices)
## ' @param robust if \code{TRUE} \code{\link[MASS]{ginv}} is used for matrix inversion, otherwise it is done via Cholesky.
#' @method rebuild hetGP
#' @rdname ExpImp
#' @export
rebuild.hetGP <- function(object, robust = FALSE) {
  # Noise-process covariance is built on the pseudo-inputs pX when present,
  # otherwise on the unique designs X0.
  Xg <- if (is.null(object$pX)) object$X0 else object$pX
  Cg <- cov_gen(X1 = Xg, theta = object$theta_g, type = object$covtype)
  C <- cov_gen(X1 = object$X0, theta = object$theta, type = object$covtype)

  # Pick the inversion strategy once: robust pseudo-inverse vs fast Cholesky
  invert <- if (robust) ginv else function(M) chol2inv(chol(M))

  # Nugget terms: g (resp. Lambda) scaled by replication, plus jitter eps
  object$Kgi <- invert(add_diag(Cg, object$eps + object$g / object$mult))
  object$Ki <- invert(add_diag(C, object$Lambda / object$mult + object$eps))
  object
}


# ###############################################################################
# ## Appendix: low level functions
# ###############################################################################

#' Prepare data for use with \code{\link[hetGP]{mleHetGP}}, in particular to find replicated observations
#' @title Data preprocessing 
#' @param X matrix of design locations, one point per row
#' @param Z vector of observations at \code{X}
#' @param return.Zlist to return \code{Zlist}, see below
#' @param rescale if \code{TRUE}, the inputs are rescaled to the unit hypercube
#' @param normalize if \code{TRUE}, the outputs are centered and normalized
#' @param inputBounds optional matrix of known boundaries in original input space, of size 2 times \code{ncol(X)}. 
#' If not provided, and \code{rescale == TRUE}, it is estimated from the data.   
#' @return A list with the following elements that can be passed to the main fitting functions, e.g., \code{\link{mleHetGP}} and \code{\link{mleHomGP}}
#' \itemize{
#' \item \code{X0} matrix with unique designs locations, one point per row,
#' \item \code{Z0} vector of averaged observations at \code{X0},
#' \item \code{mult} number of replicates at \code{X0},
#' \item \code{Z} vector with all observations, sorted according to \code{X0},
#' \item \code{Zlist} optional list, each element corresponds to observations at a design in \code{X0},
#' \item \code{inputBounds} optional matrix, to rescale back to the original input space,
#' \item \code{outputStats} optional vector, with mean and variance of the original outputs.
#' }
#' @details Replicates are searched based on character representation, using \code{\link[base]{unique}}.
#' @examples 
#' ##------------------------------------------------------------
#' ## Find replicates on the motorcycle data
#' ##------------------------------------------------------------
#' ## motorcycle data
#' library(MASS)
#' X <- matrix(mcycle$times, ncol = 1)
#' Z <- mcycle$accel
#' 
#' data_m <- find_reps(X, Z)
#' 
#' # Initial data
#' plot(X, Z, ylim = c(-160, 90), ylab = 'acceleration', xlab = "time")
#' # Display mean values
#' points(data_m$X0, data_m$Z0, pch = 20)
#' @export
find_reps <- function(X, Z, return.Zlist = TRUE, rescale = FALSE, normalize = FALSE, inputBounds = NULL) {
  # Coerce a bare input vector into a one-column design matrix
  if (is.null(dim(X)))
    X <- matrix(X, ncol = 1)

  # Degenerate case: a single design point, nothing to group.
  # NOTE(review): rescale/normalize are not applied on this early exit,
  # and inputBounds/outputStats are not returned (pre-existing behavior).
  if (nrow(X) == 1) {
    if (return.Zlist)
      return(list(X0 = X, Z0 = Z, mult = 1, Z = Z, Zlist = list(Z)))
    return(list(X0 = X, Z0 = Z, mult = 1, Z = Z))
  }

  # Map inputs to the unit hypercube, estimating bounds if not supplied
  if (rescale) {
    if (is.null(inputBounds))
      inputBounds <- apply(X, 2, range)
    lower <- matrix(inputBounds[1, ], nrow = nrow(X), ncol = ncol(X), byrow = TRUE)
    X <- (X - lower) %*% diag(1 / (inputBounds[2, ] - inputBounds[1, ]), ncol(X))
  }

  # Center and scale the outputs, remembering the stats to undo it later
  outputStats <- NULL
  if (normalize) {
    outputStats <- c(mean(Z), var(Z))
    Z <- (Z - outputStats[1]) / sqrt(outputStats[2])
  }

  X0 <- unique(X)

  # No duplicated rows: each design has exactly one observation
  if (nrow(X0) == nrow(X)) {
    res <- list(X0 = X, Z0 = Z, mult = rep(1, length(Z)), Z = Z)
    if (return.Zlist)
      res$Zlist <- as.list(Z)
    # c() keeps NULL-valued entries, unlike $<- assignment
    return(c(res, list(inputBounds = inputBounds, outputStats = outputStats)))
  }

  # Group observations by the unique design they belong to; split() orders
  # groups by ascending design index, matching X0's row order
  corresp <- find_corres(X0, X)
  Zlist <- split(Z, corresp)
  mult <- as.numeric(lengths(Zlist))

  res <- list(X0 = X0, Z0 = unlist(lapply(Zlist, mean)), mult = mult, Z = unlist(Zlist))
  if (return.Zlist)
    res$Zlist <- Zlist
  c(res, list(inputBounds = inputBounds, outputStats = outputStats))
}

#' @title Generic Log-likelihood function
#' This function can be used to compute loglikelihood for homGP/hetGP models
#' @details For hetGP, this is not the joint log-likelihood, only the likelihood of the mean process.
#' @param X0 unique designs
#' @param Z0 averaged observations
#' @param Z replicated observations (sorted with respect to X0)
#' @param mult number of replicates at each Xi
#' @param Delta vector of nuggets corresponding to each X0i or pXi, that are smoothed to give Lambda;
#' if \code{NULL}, the homoskedastic likelihood is computed instead
#' @param logN should exponentiated variance be used
#' @param theta scale parameter for the mean process, either one value (isotropic) or a vector (anistropic)
#' @param k_theta_g constant used for linking nuggets lengthscale to mean process lengthscale, i.e., theta_g[k] = k_theta_g * theta[k], alternatively theta_g can be used
#' @param theta_g either one value (isotropic) or a vector (anistropic), alternative to using k_theta_g
#' @param g nugget of the nugget process
#' @param beta0 mean, if not provided, the MLE estimator is used
#' @param eps minimal value of elements of Lambda
#' @param covtype covariance kernel type
#' @keywords internal
#' @export
logLikH <- function(X0, Z0, Z, mult, theta, g, Delta = NULL, k_theta_g = NULL, theta_g = NULL, logN = FALSE,
                    beta0 = NULL, eps = sqrt(.Machine$double.eps), covtype = "Gaussian") {
  # Heteroskedastic case: Delta provided
  if (!is.null(Delta)) {
    # k_theta_g takes precedence over theta_g when both are supplied
    if (!is.null(k_theta_g)) theta_g <- NULL
    # NOTE(review): beta0, eps and covtype are not forwarded to logLikHet here,
    # so logLikHet's own defaults apply in this branch — confirm intended.
    return(logLikHet(X0 = X0, Z0 = Z0, Z = Z, mult = mult, theta = theta, Delta = Delta, g = g,
                     k_theta_g = k_theta_g, theta_g = theta_g, logN = logN,
                     penalty = FALSE, hom_ll = NULL, trace = 0))
  }
  # Homoskedastic case
  logLikHom(X0 = X0, Z0 = Z0, Z = Z, mult = mult, theta = theta, g = g,
            beta0 = beta0, covtype = covtype, eps = eps)
}

#' @title Score and RMSE function
#' To asses the performance of the prediction, this function computes the root mean squared error and proper score function (also known as negative log-probability density).
#' @references 
#' T. Gneiting, and A. Raftery (2007). Strictly Proper Scoring Rules, Prediction, and Estimation, Journal of the American Statistical Association, 102(477), 359-378.
#' @param model \code{homGP} or \code{hetGP} model, including inverse matrices
#' @param Xtest matrix of new design locations
#' @param Ztest corresponding vector of observations, or alternatively, 
#' a matrix of size [nrow(Xtest) x number of replicates], a list of size nrow(Xtest) with a least one value per element
#' @param return.rmse if \code{TRUE}, return the root mean squared error
#' @export
scores <- function(model, Xtest, Ztest, return.rmse = FALSE) {
  preds <- predict(model, Xtest)
  # Total predictive variance: latent GP variance plus noise
  ptot <- preds$sd2 + preds$nugs

  # Ragged replicates: expand each design's mean/variance to the number of
  # observations at that design
  if (is.list(Ztest)) {
    s2 <- m <- Ztest
    for (i in seq_along(Ztest)) {
      m[[i]] <- rep(preds$mean[i], length(m[[i]]))
      s2[[i]] <- rep(ptot[i], length(s2[[i]]))
    }
    m <- unlist(t(m))
    s2 <- unlist(t(s2))
    se <- (unlist(t(Ztest)) - m)^2
    sc <- -se / s2 - log(s2)
  }

  # One observation per design
  if (is.vector(Ztest, mode = "numeric")) {
    se <- (Ztest - preds$mean)^2
    sc <- -se / ptot - log(ptot)
  }

  # Balanced replicates: one row of observations per design
  if (is.matrix(Ztest)) {
    nrep <- ncol(Ztest)
    m <- as.numeric(t(matrix(rep(preds$mean, nrep), ncol = nrep)))
    s2 <- as.numeric(t(matrix(rep(ptot, nrep), ncol = nrep)))
    se <- (unlist(t(Ztest)) - m)^2
    sc <- -se / s2 - log(s2)
  }

  if (return.rmse) return(list(sc = mean(sc), rmse = sqrt(mean(se))))
  mean(sc)
}

#' Defines lower bound for lengthscale parameters based on a low quantile of non-zero distances between points in the design.
#' @param X design matrix
#' @param min_cor minimal correlation between two design points at the defined quantile distance, default to 0.01
#' @param max_cor maximal correlation between two design points at the defined (1-p) quantile distance, default to 0.5
#' @param p quantile on distances, default to 0.05
#' @param covtype covariance function used
#' @return list with \code{lower} and \code{upper} lengthscale bounds, on the original input scale
#' @importFrom stats quantile uniroot
#' @noRd
auto_bounds <- function(X, min_cor = 0.01, max_cor = 0.5, covtype = "Gaussian", p = 0.05){
  # Work on inputs rescaled to the unit hypercube so distances are comparable
  Xsc <- find_reps(X, rep(1, nrow(X)), rescale = TRUE) # rescaled distances
  
  dists <- distance_cpp(Xsc$X0) # pairwise (squared Euclidean) distances between unique designs
  # Representative small and large distances at the p and (1-p) quantiles
  repr_low_dist <- quantile(x = dists[lower.tri(dists)], probs = p, names = FALSE)
  repr_lar_dist <- quantile(x = dists[lower.tri(dists)], probs = 1-p, names = FALSE)
  
  if(covtype == "Gaussian"){
    # Closed form: exp(-d/theta) = cor  =>  theta = -d/log(cor)
    theta_min <- - repr_low_dist/log(min_cor)
    theta_max <- - repr_lar_dist/log(max_cor)
    # Squared factor maps unit-cube lengthscales back to the original scale
    return(list(lower = theta_min * (Xsc$inputBounds[2,] - Xsc$inputBounds[1,])^2,
                upper = theta_max * (Xsc$inputBounds[2,] - Xsc$inputBounds[1,])^2))
  }else{
    # No closed form: solve cor(theta) = value numerically at a representative distance
    tmpfun <- function(theta, repr_dist, covtype, value){
      cov_gen(matrix(sqrt(repr_dist/ncol(X)), ncol = ncol(X)), matrix(0, ncol = ncol(X)), type = covtype, theta = theta) - value
    }
    # silent = TRUE: failures are reported via the controlled warning below,
    # not by printing the raw try() error (consistent with theta_max)
    theta_min <- try(uniroot(tmpfun, interval = c(sqrt(.Machine$double.eps), 100), covtype = covtype, value = min_cor, 
                         repr_dist = repr_low_dist, tol = sqrt(.Machine$double.eps))$root, silent = TRUE)
    if(is(theta_min, "try-error")){
      warning("The automatic selection of lengthscales bounds was not successful. Perhaps provide lower and upper values.")
      theta_min <- 1e-2 # conservative fallback lower bound
    }
    theta_max <- try(uniroot(tmpfun, interval = c(sqrt(.Machine$double.eps), 100), covtype = covtype, value = max_cor,
                         repr_dist = repr_lar_dist, tol = sqrt(.Machine$double.eps))$root, silent = TRUE)
    if(is(theta_max, "try-error")){
      theta_max <- 5 # conservative fallback upper bound
    }
    # Linear (not squared) rescaling for non-Gaussian kernels
    return(list(lower = theta_min * (Xsc$inputBounds[2,] - Xsc$inputBounds[1,]),
                upper = max(1, theta_max) * (Xsc$inputBounds[2,] - Xsc$inputBounds[1,])))
  }
} 

# Rho function for SiNK prediction, anistropic case
## @param covtype covariance kernel type, either 'Gaussian' or 'Matern5_2'
rho_AN <- function(xx, X0, theta_g, g, sigma = 1, type = "Gaussian", SiNK_eps = 1e-4, eps = sqrt(.Machine$double.eps), mult) {
  # Coerce a single input point into a one-row matrix
  if (is.null(nrow(xx)))
    xx <- matrix(xx, nrow = 1)

  # Noise-process covariance at the designs, with jittered nugget g/mult
  Kg <- sigma * cov_gen(X1 = X0, theta = theta_g, type = type) + diag(eps + g / mult, nrow(X0))
  # Cross-covariances between the new points and the designs
  kg <- sigma * cov_gen(X1 = xx, X2 = X0, theta = theta_g, type = type)

  # rho(x) = sqrt(k(x)' Kg^-1 k(x)) / sigma^2, floored at SiNK_eps for stability
  Kgi <- chol2inv(chol(Kg))
  pmax(SiNK_eps, sqrt(diag(kg %*% Kgi %*% t(kg))) / sigma^2)
}

# Try the hetGP package in your browser
#
# Any scripts or data that you put into this service are public.
#
# hetGP documentation built on Oct. 3, 2023, 1:07 a.m.