R/hetTP.R
In hetGP: Heteroskedastic Gaussian Process Modeling and Design under Replication

Documented in mleHetTP mleHomTP predict.hetTP predict.homTP rebuild.hetTP rebuild.homTP

#  dmt(Z, mean = rep(0,133), S = (nu - 2)/nu*(model$sigma2 * cov_gen(X, theta = model$theta) + diag(model$g, 133)), df = model$nu, log = T)
loglik_HomTP <- function(X0, Z0, Z, mult, theta, g, nu, sigma2, beta0 = 0,
                         covtype = "Gaussian", eps = sqrt(.Machine$double.eps), env = NULL){
  n <- nrow(X0)
  N <- length(Z)
  
  C <- cov_gen(X1 = X0, theta = theta, type = covtype)
  # Temporarily store Cholesky transform of K in Ki
  Ki <- chol(add_diag(sigma2 * C, eps + g/mult))
  ldetKi <- - 2 * sum(log(diag(Ki))) # log determinant from Cholesky
  Ki <- chol2inv(Ki)
  
  if(!is.null(env)){
    env$C <- C
    env$Ki <- Ki
  } 
  
  psi_0 <- drop(crossprod(Z0 - beta0, Ki) %*% (Z0 - beta0))
  
  psi <- (crossprod(Z - beta0) - crossprod((Z0 - beta0) * mult, Z0 - beta0))/g + psi_0
  
  return(-N/2 * log((nu - 2) * pi) + ldetKi/2 - (N - n)/2 * log(g) - 1/2 * sum(log(mult)) + lgamma((nu + N)/2) - lgamma(nu/2) - (nu + N)/2 * log(1 + psi/(nu - 2)))
}


# derivative of log-likelihood for logLikHom with respect to theta with all observations (Gaussian kernel)
## Model: noisy observations with unknown homoskedastic noise
## K = tau^2 * C + g * I
# X0  design matrix (no replicates)
# Z0 averaged observations
# Z observations vector (all observations)
# mult number of replicates per unique design point
# theta vector of lengthscale hyperparameters (or one for isotropy)
# g noise variance for the process
# tau2 scale
# nu degrees of freedom
# beta0 trend
## @return gradient with respect to theta and g
dlogLikHomTP <- function(X0, Z0, Z, mult, theta, g, nu, sigma2, beta0 = 0, covtype = "Gaussian", eps = sqrt(.Machine$double.eps),
                         components = c("theta", "g", "nu", "sigma2"), env = NULL){
  N <- length(Z)
  n <- nrow(X0)
  
  if(!is.null(env)){
    C <- env$C
    Ki <- env$Ki
  }else{
    C <- cov_gen(X1 = X0, theta = theta, type = covtype)
    Ki <- chol2inv(chol(sigma2*C + diag(eps + g / mult)))
  }
  
  Z0 <- Z0 - beta0
  Z <- Z - beta0
  
  KiZ0 <- Ki %*% Z0 ## to avoid recomputing  
  
  psi_0 <- drop(crossprod(Z0, KiZ0))
  
  psi <- (crossprod(Z) - crossprod(Z0 * mult, Z0))/g + psi_0
  
  
  tmp1 <- tmp2 <- tmp3 <- tmp4 <- NULL
  
  # First component, derivative with respect to theta
  if("theta" %in% components){
    tmp1 <- rep(NA, length(theta))
    if(length(theta)==1){
      dC_dthetak <- partial_cov_gen(X1 = X0, theta = theta, type = covtype, arg = "theta_k") * C
      tmp1 <- sigma2 * (nu + N)/(2 * (nu + psi - 2)) * crossprod(KiZ0, dC_dthetak) %*% KiZ0  - sigma2/2 * trace_sym(Ki, dC_dthetak)
    }else{
      for(i in 1:length(theta)){
        dC_dthetak <- partial_cov_gen(X1 = X0[,i, drop = F], theta = theta[i], type = covtype, arg = "theta_k") * C
        tmp1[i] <- sigma2 * (nu + N)/(2 * (nu + psi - 2)) * crossprod(KiZ0, dC_dthetak) %*% KiZ0 - sigma2/2 * trace_sym(Ki, dC_dthetak)
      }
    } 
  }
  
  if("nu" %in% components) 
    tmp2 <- -N / (2*(nu - 2)) + 1/2*digamma((nu + N)/2) - 1/2*digamma(nu/2) - 1/2 * log(1 + psi/(nu - 2)) + psi * (nu + N)/(2*(nu - 2)^2 + 2 * psi * (nu - 2))
  
  if("sigma2" %in% components) 
    tmp3 <- (nu + N)/(2 * (nu + psi - 2)) * crossprod(KiZ0, C) %*% KiZ0 -1/2 * trace_sym(Ki, C)
  
  # Fourth component derivative with respect to g
  if("g" %in% components)
    tmp4 <- (nu + N)/(2 * (nu + psi - 2)) * ((crossprod(Z) - crossprod(Z0 * mult, Z0))/g^2 + sum(KiZ0^2/mult)) - (N - n)/ (2*g) - 1/2 * sum(diag(Ki)/mult)
  
  return(c(tmp1, tmp2, tmp3, tmp4))
}



#' Student-t process regression under homoskedastic noise based on maximum likelihood estimation of the 
#' hyperparameters. This function is enhanced to deal with replicated observations.
#' @title Student-T process modeling with homoskedastic noise
#' @param X matrix of all designs, one per row, or list with elements:
#' \itemize{
#'   \item \code{X0} matrix of unique design locations, one point per row
#'   \item \code{Z0} vector of averaged observations, of length \code{nrow(X0)}
#'   \item \code{mult} number of replicates at designs in \code{X0}, of length \code{nrow(X0)}
#' } 
#' @param Z vector of all observations. If using a list with \code{X}, \code{Z} has to be ordered with respect to \code{X0}, and of length \code{sum(mult)}
#' @param lower,upper bounds for the \code{theta} parameter (see \code{\link[hetGP]{cov_gen}} for the exact parameterization).
#' In the multivariate case, it is possible to give vectors for bounds (resp. scalars) for anisotropy (resp. isotropy) 
#' @param known optional list of known parameters, e.g., \code{beta0} (default to \code{0}), \code{theta}, \code{g}, \code{sigma2} or \code{nu}
#' @param covtype covariance kernel type, either 'Gaussian', 'Matern5_2' or 'Matern3_2', see \code{\link[hetGP]{cov_gen}}
#' @param noiseControl list with element,
#' \itemize{
#' \item \code{g_bound}, vector providing minimal and maximal noise variance
#' \item \code{sigma2_bounds}, vector providing minimal and maximal signal variance
#' \item \code{nu_bounds}, vector providing minimal and maximal values for the degrees of freedom. 
#' The mininal value has to be stricly greater than 2. If the mle optimization gives a large value, e.g., 30,
#' considering a GP with \code{\link[hetGP]{mleHomGP}} may be better. 
#' } 
#' @param init list specifying starting values for MLE optimization, with elements:
#' \itemize{
#'  \item \code{theta_init} initial value of the theta parameters to be optimized over (default to 10\% of the range determined with \code{lower} and \code{upper})
#'  \item \code{g_init} initial value of the nugget parameter to be optimized over (based on the variance at replicates if there are any, else 10\% of the variance)
#'  \item \code{sigma2} initial value of the variance paramter (default to \code{1})
#'  \item \code{nu} initial value of the degrees of freedom parameter (default to \code{3})
#' }
#' @param maxit maximum number of iteration for L-BFGS-B of \code{\link[stats]{optim}}
#' @param eps jitter used in the inversion of the covariance matrix for numerical stability
#' @param settings list with argument \code{return.Ki}, to include the inverse covariance matrix in the object for further use (e.g., prediction).
#' Arguments \code{factr} (default to 1e9) and \code{pgtol} are available to be passed to \code{control} for L-BFGS-B in \code{\link[stats]{optim}}.
#' @return a list which is given the S3 class "\code{homGP}", with elements:
#' \itemize{
#' \item \code{theta}: maximum likelihood estimate of the lengthscale parameter(s),
#' \item \code{g}: maximum likelihood estimate of the nugget variance,
#' \item \code{trendtype}: either "\code{SK}" if \code{beta0} is given, else "\code{OK}" 
#' \item \code{beta0}: estimated trend unless given in input,
#' \item \code{sigma2}:  maximum likelihood estimate of the scale variance,
#' \item \code{nu2}:  maximum likelihood estimate of the degrees of freedom parameter,
#' \item \code{ll}: log-likelihood value,
#' \item \code{X0}, \code{Z0}, \code{Z}, \code{mult}, \code{eps}, \code{covtype}: values given in input,
#' \item \code{call}: user call of the function
#' \item \code{used_args}: list with arguments provided in the call
#' \item \code{nit_opt}, \code{msg}: \code{counts} and \code{msg} returned by \code{\link[stats]{optim}}
#' \item \code{Ki}, inverse covariance matrix (if \code{return.Ki} is \code{TRUE} in \code{settings})
#' \item \code{time}: time to train the model, in seconds.
#' 
#'}
#' @details
#' The global covariance matrix of the model is parameterized as \code{K = sigma2 * C + g * diag(1/mult)},
#' with \code{C} the correlation matrix between unique designs, depending on the family of kernel used (see \code{\link[hetGP]{cov_gen}} for available choices).
#' 
#' It is generally recommended to use \code{\link[hetGP]{find_reps}} to pre-process the data, to rescale the inputs to the unit cube and to normalize the outputs.
#' 
#' @seealso \code{\link[hetGP]{predict.homTP}} for predictions.
#' \code{summary} and \code{plot} functions are available as well. 
#' @references 
#' M. Binois, Robert B. Gramacy, M. Ludkovski (2018), Practical heteroskedastic Gaussian process modeling for large simulation experiments,
#' Journal of Computational and Graphical Statistics, 27(4), 808--821.\cr 
#' Preprint available on arXiv:1611.05902.\cr \cr
#' 
#' A. Shah, A. Wilson, Z. Ghahramani (2014), Student-t processes as alternatives to Gaussian processes, Artificial Intelligence and Statistics, 877--885. \cr \cr
#' 
#' M. Chung, M. Binois, RB Gramacy, DJ Moquin, AP Smith, AM Smith (2019). 
#' Parameter and Uncertainty Estimation for Dynamical Systems Using Surrogate Stochastic Processes.
#' SIAM Journal on Scientific Computing, 41(4), 2212-2238.\cr
#' Preprint available on arXiv:1802.00852. 
#' @examples
#' ##------------------------------------------------------------
#' ## Example 1: Homoskedastic Student-t modeling on the motorcycle data
#' ##------------------------------------------------------------
#' set.seed(32)
#' 
#' ## motorcycle data
#' library(MASS)
#' X <- matrix(mcycle$times, ncol = 1)
#' Z <- mcycle$accel
#' plot(X, Z, ylim = c(-160, 90), ylab = 'acceleration', xlab = "time")
#' 
#' noiseControl = list(g_bounds = c(1e-3, 1e4))
#' model <- mleHomTP(X = X, Z = Z, lower = 0.01, upper = 100, noiseControl = noiseControl)
#' summary(model)
#'   
#' ## Display averaged observations
#' points(model$X0, model$Z0, pch = 20) 
#' xgrid <- matrix(seq(0, 60, length.out = 301), ncol = 1) 
#' preds <- predict(x = xgrid, object =  model)
#' 
#' ## Display mean prediction
#' lines(xgrid, preds$mean, col = 'red', lwd = 2)
#' ## Display 95% confidence intervals
#' lines(xgrid, preds$mean + sqrt(preds$sd2) * qt(0.05, df = model$nu + nrow(X)), col = 2, lty = 2)
#' lines(xgrid, preds$mean + sqrt(preds$sd2) * qt(0.95, df = model$nu + nrow(X)), col = 2, lty = 2)
#' ## Display 95% prediction intervals
#' lines(xgrid, preds$mean + sqrt(preds$sd2 + preds$nugs) * qt(0.05, df = model$nu + nrow(X)), 
#'   col = 3, lty = 2)
#' lines(xgrid, preds$mean + sqrt(preds$sd2 + preds$nugs) * qt(0.95, df = model$nu + nrow(X)), 
#'   col = 3, lty = 2)
#' @export
mleHomTP <- function(X, Z, lower = NULL, upper = NULL, known = list(beta0 = 0),
                     noiseControl = list(g_bounds = c(sqrt(.Machine$double.eps), 1e4),
                                         nu_bounds = c(2 + 1e-3, 30),
                                         sigma2_bounds = c(sqrt(.Machine$double.eps), 1e4)),
                     init = list(nu = 3), covtype = c("Gaussian", "Matern5_2", "Matern3_2"), maxit = 100,
                     eps = sqrt(.Machine$double.eps),
                     settings = list(return.Ki = TRUE, factr = 1e9)){
  
  if(typeof(X)=="list"){
    X0 <- X$X0
    Z0 <- X$Z0
    mult <- X$mult
    if(sum(mult) != length(Z)) stop("Length(Z) should be equal to sum(mult)")
    if(is.null(dim(X0))) X0 <- matrix(X0, ncol = 1)
    if(length(Z0) != nrow(X0)) stop("Dimension mismatch between Z0 and X0")
  }else{
    if(is.null(dim(X))) X <- matrix(X, ncol = 1)
    if(nrow(X) != length(Z)) stop("Dimension mismatch between Z and X")
    elem <- find_reps(X, Z, return.Zlist = F)
    X0 <- elem$X0
    Z0 <- elem$Z0
    Z <- elem$Z
    mult <- elem$mult
  }
  
  covtype <- match.arg(covtype)
  
  if(is.null(lower) || is.null(upper)){
    auto_thetas <- auto_bounds(X = X0, covtype = covtype)
    if(is.null(lower)) lower <- auto_thetas$lower
    if(is.null(upper)) upper <- auto_thetas$upper
    if(is.null(known[["theta"]]) && is.null(init$theta)) init$theta <- sqrt(upper * lower)
  }
  
  if(length(lower) != length(upper)) stop("upper and lower should have the same size")
  
  ## Save time to train model
  tic <- proc.time()[3]
  
  if(is.null(settings$return.Ki)) settings$return.Ki <- TRUE
  if(is.null(settings$factr)) settings$factr <- 1e9
  if(is.null(noiseControl$g_bounds)) noiseControl$g_bounds <- c(sqrt(.Machine$double.eps), 1e4)
  if(is.null(noiseControl$sigma2_bounds)) noiseControl$sigma2_bounds <- c(sqrt(.Machine$double.eps), 1e4)
  if(is.null(noiseControl$nu_bounds)) noiseControl$nu_bounds <- c(2 + 1e-3, 30)
  
  if(is.null(known$beta0)) known$beta0 <- 0
  beta0 <- known$beta0
  
  N <- length(Z)
  n <- nrow(X0)
  
  if(is.null(n))
    stop("X0 should be a matrix. \n")
  
  if(is.null(known[["theta"]]) && is.null(init$theta)) init$theta <- 0.1 * lower + 0.9 * upper
  if(is.null(known$g) && is.null(init$g)){
    if(any(mult > 2)) init$g <- mean((fast_tUY2(mult, (Z - rep(Z0, times = mult))^2)/mult)[which(mult > 2)]) else init$g <- 0.1 * var(Z0)
  } 
  if(is.null(known$nu) && is.null(init$nu)) init$nu <- 3
  if(is.null(known$sigma2) && is.null(init$sigma2)) init$sigma2 <- var(Z0)

  parinit <- c(init$theta, init$nu, init$sigma2, init$g)
  
  # trendtype <- 'OK'
  # if(!is.null(beta0))
  #   trendtype <- 'SK'
  
  ## General definition of fn and gr
  fn <- function(par, X0, Z0, Z, mult, beta0, theta, sigma2, nu, g, env){
    idx <- 1 # to store the first non used element of par
    
    if(is.null(theta)){
      idx <- length(init$theta)
      theta <- par[1:idx]
      idx <- idx + 1
    }
    if(is.null(nu)){
      nu <- par[idx]
      idx <- idx + 1
    }
    if(is.null(sigma2)){
      sigma2 <- par[idx]
      idx <- idx + 1
    }
    if(is.null(g)){
      g <- par[idx]
    }
    
    loglik <- loglik_HomTP(X0 = X0, Z0 = Z0, Z = Z, mult = mult, theta = theta, g = g, nu = nu, sigma2 = sigma2,
                           beta0 = beta0, covtype = covtype, eps = eps, env = env)
    
    if(!is.null(env) && !is.na(loglik)){
      if(is.null(env$max_loglik) || loglik > env$max_loglik){
        env$max_loglik <- loglik
        env$arg_max <- par
      }
    } 
    
    return(loglik)
  }
  
  gr <- function(par, X0, Z0, Z, mult, beta0, theta, sigma2, nu, g, env){
    idx <- 1
    components <- NULL
    
    if(is.null(theta)){
      theta <- par[1:length(init$theta)]
      idx <- idx + length(init$theta)
      components <- "theta"
    }
    if(is.null(nu)){
      nu <- par[idx]
      idx <- idx + 1
      components <- c(components, "nu")
    }
    if(is.null(sigma2)){
      sigma2 <- par[idx]
      idx <- idx + 1
      components <- c(components, "sigma2")
    }
    if(is.null(g)){
      g <- par[idx]
      components <- c(components, "g")
    }
    
    return(dlogLikHomTP(X0 = X0, Z0 = Z0, Z = Z, mult = mult, theta = theta, g = g, nu = nu, sigma2 = sigma2,
                        beta0 = beta0, covtype = covtype, eps = eps, components = components, env = env))
  }
  
  ## All known
  if(!is.null(known$g) && !is.null(known[["theta"]]) && !is.null(known$nu) && !is.null(known$sigma2)){
    theta_out <- known[["theta"]]
    nu_out <- known$nu
    sigma2_out <- known$sigma2
    g_out <- known$g
    out <- list(value = loglik_HomTP(X0 = X0, Z0 = Z0, Z = Z, mult = mult, theta = theta_out, g = g_out, nu = nu_out, sigma2 = sigma2_out, 
                                     beta0 = beta0, covtype = covtype, eps = eps),
                message = "All hyperparameters given", counts = 0, time = proc.time()[3] - tic)
  }else{
    parinit <- lowerOpt <- upperOpt <- NULL
    if(is.null(known[["theta"]])){
      parinit <- init$theta
      lowerOpt <- c(lower)
      upperOpt <- c(upper)
    }
    if(is.null(known$nu)){
      parinit <- c(parinit, init$nu)
      lowerOpt <- c(lowerOpt, noiseControl$nu_bounds[1])
      upperOpt <- c(upperOpt, noiseControl$nu_bounds[2])
    }
    if(is.null(known$sigma2)){
      parinit <- c(parinit, init$sigma2)
      lowerOpt <- c(lowerOpt, noiseControl$sigma2_bounds[1])
      upperOpt <- c(upperOpt, noiseControl$sigma2_bounds[2])
    }
    if(is.null(known$g)){
      parinit <- c(parinit, init$g)
      lowerOpt <- c(lowerOpt, noiseControl$g_bounds[1])
      upperOpt <- c(upperOpt, noiseControl$g_bounds[2])
    }
    
    
    # environment storing values from log to pass to dlog
    envtmp <- environment()
    out <- try(optim(par = parinit, fn = fn, gr = gr, method = "L-BFGS-B", lower = lowerOpt, upper = upperOpt, theta = known[["theta"]], 
                 nu = known$nu, sigma2 = known$sigma2, g = known$g, env = envtmp,
                 X0 = X0, Z0 = Z0, Z = Z, mult = mult, beta0 = beta0,
                 control = list(fnscale = -1, maxit = maxit, factr = settings$factr, pgtol = settings$pgtol)))
    ## Catch errors when at least one likelihood evaluation worked
    if(is(out, "try-error"))
      out <- list(par = envtmp$arg_max, value = envtmp$max_loglik, counts = NA,
                  message = "Optimization stopped due to NAs, use best value so far")
    
    ## Post-processing
    idx <- 1
    if(is.null(known[["theta"]])){
      theta_out <- out$par[1:length(init$theta)]
      idx <- idx + length(init$theta)
    }else{
      theta_out <- known$theta
    }
    if(is.null(known$nu)){
      nu_out <-  out$par[idx]
      idx <- idx + 1
    }else{
      nu_out <- known$nu
    }
    if(is.null(known$sigma2)){
      sigma2_out <-  out$par[idx]
      idx <- idx + 1
    }else{
      sigma2_out <- known$sigma2
    }
    if(is.null(known$g)){
      g_out <-  out$par[idx]
    }else{
      g_out <- known$g
    }
    
  }
  
  Ki <- chol2inv(chol(add_diag(sigma2_out * cov_gen(X1 = X0, theta = theta_out, type = covtype), eps + g_out/mult)))
  
  # if(is.null(beta0))
  #   beta0 <- drop(colSums(Ki) %*% Z0 / sum(Ki))
  
  psi_0 <- drop(crossprod(Z0 - beta0, Ki) %*% (Z0 - beta0))
  
  psi <- drop(crossprod(Z - beta0) - crossprod((Z0 - beta0) * mult, Z0 - beta0))/g_out + psi_0
  
  res <- list(theta = theta_out, g = g_out, nu = nu_out, sigma2 = sigma2_out, mult = mult,
              # trendtype = trendtype,
              ll = out$value, nit_opt = out$counts,
              psi = psi,
              beta0 = beta0,  covtype = covtype, msg = out$message, eps = eps,
              X0 = X0, Z = Z, mult = mult, Z0 = Z0, call = match.call(),
              used_args = list(lower = lower, upper = upper, known = known, noiseControl = noiseControl), 
              time = proc.time()[3] - tic)
  
  if(settings$return.Ki) res <- c(res, list(Ki = Ki))
  
  class(res) <- "homTP"
  return(res)
  
}

#' @method summary homTP
#' @export
summary.homTP <- function(object,...){
  ans <- object
  class(ans) <- "summary.homTP"
  ans
}

#' @export
print.summary.homTP <- function(x, ...){
  
  cat("N = ", length(x$Z), " n = ", length(x$Z0), " d = ", ncol(x$X0), "\n")
  cat(x$covtype, " covariance lengthscale values: ", x$theta, "\n")
  
  cat("Homoskedastic nugget value: ", x$g, "\n")
  
  cat("Variance/scale hyperparameter: ", x$sigma2, "\n")
  cat("Degree of freedom nu:", x$nu, "\n")
  
  cat("Given constant trend value: ", x$beta0, "\n")
  
  cat("MLE optimization: \n", "Log-likelihood = ", x$ll, "; Nb of evaluations (obj, gradient) by L-BFGS-B: ", x$nit_opt, "; message: ", x$msg, "\n")
}

#' @method print homTP
#' @export
print.homTP <- function(x, ...){
  print(summary(x))
  
}

#' @method plot homTP
#' @export
plot.homTP <- function(x, ...){
  LOOpreds <- LOO_preds(x)
  plot(x$Z, LOOpreds$mean[rep(1:nrow(x$X0), times = x$mult)], xlab = "Observed values", ylab = "Predicted values",
       main = "Leave-one-out predictions")
  arrows(x0 = LOOpreds$mean + sqrt(LOOpreds$sd2) * qt(0.05, df = x$nu + nrow(x$X0)),
         x1 = LOOpreds$mean + sqrt(LOOpreds$sd2) * qt(0.95, df = x$nu + nrow(x$X0)),
         y0 = LOOpreds$mean, length = 0, col = "blue")
  points(x$Z0[which(x$mult > 1)], LOOpreds$mean[which(x$mult > 1)], pch = 20, col = 2)
  abline(a = 0, b = 1, lty = 3)
  legend("topleft", pch = c(1, 20, NA), lty = c(NA, NA, 1), col = c(1, 2, 4),
         legend = c("observations", "averages (if > 1 observation)", "LOO prediction interval"))
}

#' Student-t process predictions using a homoskedastic noise GP object (of class \code{homGP})
#' @param x matrix of designs locations to predict at
#' @param object an object of class \code{homGP}; e.g., as returned by \code{\link[hetGP]{mleHomTP}}
#' @param xprime optional second matrix of predictive locations to obtain the predictive covariance matrix between \code{x} and \code{xprime}
#' @param ... no other argument for this method
#' @return list with elements
#' \itemize{
#' \item \code{mean}: kriging mean;
#' \item \code{sd2}: kriging variance (filtered, e.g. without the nugget value)
#' \item \code{cov}: predictive covariance matrix between \code{x} and \code{xprime}
#' \item \code{nugs}: nugget value at each prediction location
#' }
#' @importFrom MASS ginv
#' @details The full predictive variance corresponds to the sum of \code{sd2} and \code{nugs}.
#' @method predict homTP
#' @export
predict.homTP <- function(object, x, xprime = NULL, ...){
  
  if(is.null(dim(x))){
    x <- matrix(x, nrow = 1)
    if(ncol(x) != ncol(object$X0)) stop("x is not a matrix")
  }
  
  if(!is.null(xprime) && is.null(dim(xprime))){
    xprime <- matrix(xprime, nrow = 1)
    if(ncol(xprime) != ncol(object$X0)) stop("xprime is not a matrix")
  }
  
  if(is.null(object$Ki))
    object$Ki <- chol2inv(chol(object$sigma2 * cov_gen(X1 = object$X0, theta = object$theta, type = object$covtype) + diag(object$g/object$mult + object$eps)))
  
  
  n1 <- length(object$Z)
  kx <- object$sigma2 * cov_gen(X1 = x, X2 = object$X0, theta = object$theta, type = object$covtype)
  
  mean <- as.vector(object$beta0 + kx %*% (object$Ki %*% (object$Z0 - object$beta0)))
  
  # if(object$trendtype == 'SK'){
  sd2 <- (object$nu + object$psi - 2) / (object$nu + n1 - 2) * as.vector(object$sigma2 - fast_diag(kx, tcrossprod(object$Ki, kx)))
  # }else{
  # sd2 <- as.vector(object$nu_hat - fast_diag(kx, tcrossprod(object$Ki, kx)) + (1 - tcrossprod(rowSums(object$Ki), kx))^2/sum(object$Ki))
  # }
  
  ## In case of numerical errors, some sd2 values may become negative
  if(any(sd2 < 0)){
    # object$Ki <- ginv(add_diag(cov_gen(X1 = object$X, theta = object$theta, type = object$covtype), rep(object$g + object$eps, nrow(object$X0))))/object$sigma2
    # mean <- as.vector(object$beta0 + kx %*% (object$Ki %*% (object$Z0 - object$beta0)))
    # 
    # sd2 <- (object$nu + object$psi - 2) / (object$nu + n1 - 2) * as.vector(object$sigma2 - fast_diag(kx, tcrossprod(object$Ki, kx)))
    sd2 <- pmax(0, sd2)
    warning("Numerical errors caused some negative predictive variances to be thresholded to zero. Consider using ginv via rebuild.homTP")
  }
  
  if(!is.null(xprime)){
    kxprime <- object$sigma2 * cov_gen(X1 = object$X0, X2 = xprime, theta = object$theta, type = object$covtype)
    # if(object$trendtype == 'SK'){
    if(nrow(x) > nrow(xprime)){
      cov <- (object$nu + object$psi - 2) / (object$nu + n1 - 2) * (object$sigma2 * cov_gen(X1 = x, X2 = xprime, theta = object$theta, type = object$covtype) - kx %*% (object$Ki %*% kxprime))
    }else{
      cov <- (object$nu + object$psi - 2) / (object$nu + n1 - 2) * (object$sigma2 * cov_gen(X1 = x, X2 = xprime, theta = object$theta, type = object$covtype) - (kx %*% object$Ki) %*% kxprime)
    }
    # }else{
    # cov <- object$sigma2 * cov_gen(X1 = x, X2 = xprime, theta = object$theta, type = object$covtype) - kx %*% tcrossprod(object$Ki, kxprime) + crossprod(1 - tcrossprod(rowSums(object$Ki), kx), 1 - tcrossprod(rowSums(object$Ki), kxprime))/sum(object$Ki)
    # }
  }else{
    cov = NULL
  }
  
  return(list(mean = mean, sd2 = sd2, cov = cov, nugs = rep(object$g, length(mean))))
}


## ' Rebuild inverse covariance matrix of \code{homTP} (e.g., if exported without \code{Ki})
## ' @param object \code{homTP} model without slot \code{Ki} (inverse covariance matrix)
## ' @param robust if \code{TRUE} \code{\link[MASS]{ginv}} is used for matrix inversion, otherwise it is done via Cholesky.
#' @rdname ExpImp
#' @method rebuild homTP
#' @export
rebuild.homTP <- function(object, robust = FALSE){
  if(robust){
    object$Ki <- ginv(add_diag(cov_gen(X1 = object$X, theta = object$theta, type = object$covtype), rep(object$g + object$eps, nrow(object$X0))))/object$sigma2
  }else{
    object$Ki <- chol2inv(chol(object$sigma2 * cov_gen(X1 = object$X0, theta = object$theta, type = object$covtype) + diag(object$g/object$mult + object$eps)))
  }
  
  return(object)
}

###############################################################################
## Part II: Heterogeneous TP with all options for the fit
###############################################################################


# dmt(Z, mean = rep(0,length(Z)), S = (model$nu - 2)/model$nu*(model$sigma2 * cov_gen(X, theta = model$theta, type = "Matern5_2") + diag(rep(model$Lambda, times = model$mult) + 1e-8)), df = model$nu,  log = T)
logLik_HetTP <- function(X0, Z0, Z, mult, Delta, theta, g, nu, sigma2, k_theta_g = NULL, theta_g = NULL, logN = TRUE,
                         beta0 = 0, eps = sqrt(.Machine$double.eps), covtype = "Gaussian",
                         penalty = T, hom_ll = NULL, env = NULL, trace = 0){
  n <- nrow(X0)
  N <- length(Z)
  
  if(is.null(theta_g))
    theta_g <- k_theta_g * theta
  
  Cg <- cov_gen(X1 = X0, theta = theta_g, type = covtype)
  Kg_c <- chol(Cg + diag(eps + g/mult))
  Kgi <- chol2inv(Kg_c)
  
  nmean <- drop(rowSums(Kgi) %*% Delta / sum(Kgi)) ## ordinary kriging mean
  
  M <- Cg %*% (Kgi %*% (Delta - nmean))
  
  
  Lambda <- drop(nmean + M)
  
  if(logN){
    Lambda <- exp(Lambda)
  }
  else{
    Lambda[Lambda <= 0] <- eps
  }
  
  LambdaN <- rep(Lambda, times = mult)
  
  C <- cov_gen(X1 = X0, theta = theta, type = covtype)
  Ki <- chol(add_diag(sigma2 * C, Lambda/mult + eps))
  ldetKi <- - 2 * sum(log(diag(Ki))) # log determinant from Cholesky
  Ki <- chol2inv(Ki)
  
  if(!is.null(env)){
    env$C <- C
    env$Cg <- Cg
    env$Kg_c <- Kg_c
    env$Kgi <- Kgi
    env$ldetKi <- ldetKi
    env$Ki <- Ki
  }
  
  psi_0 <- drop(crossprod(Z0 - beta0, Ki) %*% (Z0 - beta0))
  
  psi <- drop(crossprod((Z - beta0)/LambdaN, Z - beta0) - crossprod((Z0 - beta0) * mult/Lambda, Z0 - beta0) + psi_0)
  
  loglik <- -N/2 * log((nu - 2) * pi) + ldetKi/2 - 1/2 * sum((mult - 1) * log(Lambda) + log(mult)) + lgamma((nu + N)/2) - lgamma(nu/2) - (nu + N)/2 * log(1 + psi/(nu - 2))
  
  if(penalty){
    nu_hat_var <- drop(crossprod(Delta - nmean, Kgi) %*% (Delta - nmean))/length(Delta)
    
    ## To avoid 0 variance, e.g., when Delta = nmean
    if(nu_hat_var < eps) return(loglik)
    
    # if(hardpenalty)
    #   return(loglik + min(0, - n/2 * log(nu_hat_var) - sum(log(diag(Kg_c))) - n/2*log(2*pi) - n/2))
    
    pen <- - n/2 * log(nu_hat_var) - sum(log(diag(Kg_c))) - n/2*log(2*pi) - n/2
    
    if(loglik < hom_ll && pen > 0){
      if(trace> 0) warning("Penalty is desactivated when unpenalized likelihood is lower than its homTP equivalent")
      return(loglik)
    }
    
    return(loglik + pen)
  }
  return(loglik)
}

dlogLikHetTP <- function(X0, Z0, Z, mult, Delta, theta, nu, sigma2, g, k_theta_g = NULL, theta_g = NULL, beta0 = NULL, pX = NULL,
                         logN = TRUE, components = NULL, eps = sqrt(.Machine$double.eps), covtype = "Gaussian", 
                         penalty = T, hom_ll = NULL, env = NULL){
  
  ## Verifications
  if(is.null(k_theta_g) && is.null(theta_g))
    cat("Either k_theta_g or theta_g must be provided \n")
  
  ## Initialisations
  
  # Which terms need to be computed
  if(is.null(components)){
    components <- c(list("theta", "Delta", "g", "nu"))
    if(is.null(k_theta_g)){
      components <- c(components, list("theta_g"))
    }else{
      components <- c(components, list("k_theta_g"))
    }
    if(!is.null(pX))
      components <- c(components, list("pX"))
  }
  
  if(is.null(theta_g))
    theta_g <- k_theta_g * theta
  
  n <- nrow(X0)
  N <- length(Z)
  
  if(!is.null(env)){
    Cg <- env$Cg
    Kg_c <- env$Kg_c
    Kgi <- env$Kgi
  }else{
    Cg <- cov_gen(X1 = X0, theta = theta_g, type = covtype)
    Kg_c <- chol(Cg + diag(eps + g/mult))
    Kgi <- chol2inv(Kg_c)
  }

  # M <- Cg %*% Kgi
  M <- add_diag(Kgi * (-eps - g / mult), rep(1, n))
  
  ## Precomputations for reuse
  
  rSKgi <- rowSums(Kgi)
  sKgi <- sum(Kgi)
  
  nmean <- drop(rSKgi %*% Delta / sKgi) ## ordinary kriging mean
  
  ## Precomputations for reuse
  KgiD <- Kgi %*% (Delta - nmean)
  
  # if(penalty){
  #   nu_hat_var <- drop(crossprod(KgiD, (Delta - nmean)))/length(Delta) 
  #   # To prevent numerical issues when Delta = nmean, giving a positive penalty (or if the penalty is positive)
  #   if(nu_hat_var < eps || (hardpenalty && (- n/2 * log(nu_hat_var) - sum(log(diag(Kg_c))) - n/2*log(2*pi) - n/2) > 0)) penalty <- FALSE
  # }                                          
  
  Lambda <- drop(nmean + M %*% (Delta - nmean))
  if(logN){
    Lambda <- exp(Lambda)
  }
  else{
    Lambda[Lambda <= 0] <- eps
  }
  
  LambdaN <- rep(Lambda, times = mult)
  
  if(!is.null(env)){
    C <- env$C
    Ki <- env$Ki
    ldetKi <- env$ldetKi
  }else{
    C <- cov_gen(X1 = X0, theta = theta, type = covtype)
    Ki <- chol(sigma2 * C + diag(Lambda/mult + eps))
    ldetKi <- - 2 * sum(log(diag(Ki))) # log determinant from Cholesky
    Ki <- chol2inv(Ki)
  }

  if(is.null(beta0))
    beta0 <- drop(colSums(Ki) %*% Z0 / sum(Ki))
  
  ## Precomputations for reuse
  KiZ0 <- Ki %*% (Z0 - beta0)
  rsM <- rowSums(M)
  
  psi_0 <- drop(crossprod(KiZ0, Z0 - beta0))
  
  psi <- drop(crossprod((Z - beta0)/LambdaN, Z - beta0) - crossprod((Z0 - beta0) * mult/Lambda, Z0 - beta0) + psi_0)
  
  if(penalty){
    nu_hat_var <- drop(crossprod(KgiD, (Delta - nmean)))/length(Delta) 
    
    # To prevent numerical issues when Delta = nmean, resulting in divisions by zero
    if(nu_hat_var < eps){
      penalty <- FALSE
    }else{
      loglik <- -N/2 * log(2*pi) - N/2 * log(psi/N)  + 1/2 * ldetKi - 1/2 * sum((mult - 1) * log(Lambda) + log(mult)) - N/2
      pen <- - n/2 * log(nu_hat_var) - sum(log(diag(Kg_c))) - n/2*log(2*pi) - n/2
      if(loglik < hom_ll && pen > 0) penalty <- FALSE
    }
    # if(nu_hat_var < eps || (hardpenalty && (- n/2 * log(nu_hat_var) - sum(log(diag(Kg_c))) - n/2*log(2*pi) - n/2) > 0)) penalty <- FALSE
  }   
  
  dLogL_dtheta <- dLogL_dDelta <- dLogL_dkthetag <- dLogL_dthetag <- dLogL_dg <- dLogL_dnu <- dLogL_dsigma2 <- NULL
  
  # First component, derivative of logL with respect to theta
  if("theta" %in% components){
    dLogL_dtheta <- rep(NA, length(theta))
    for(i in 1:length(theta)){
      if(length(theta) == 1){
        dC_dthetak <- partial_cov_gen(X1 = X0, theta = theta, arg = "theta_k", type = covtype) * C # partial derivative of C with respect to theta
      }else{
        dC_dthetak <- partial_cov_gen(X1 = X0[, i, drop = FALSE], theta = theta[i], arg = "theta_k", type = covtype) * C # partial derivative of C with respect to theta
      }
      
      if("k_theta_g" %in% components){
        if(length(theta) == 1){
          dCg_dthetak <- partial_cov_gen(X1 = X0, theta = k_theta_g * theta, arg = "theta_k", type = covtype) * k_theta_g * Cg # partial derivative of Cg with respect to theta[i]
        }else{
          dCg_dthetak <- partial_cov_gen(X1 = X0[, i, drop = FALSE], theta = k_theta_g * theta[i], arg = "theta_k", type = covtype) * k_theta_g * Cg # partial derivative of Cg with respect to theta[i]
        }
        
        # Derivative Lambda / theta_k (first part)
        dLdtk <- dCg_dthetak %*% KgiD - M %*% (dCg_dthetak %*% KgiD)
        
        # (second part)
        dLdtk <- dLdtk - (1 - rsM) * drop(rSKgi %*% dCg_dthetak %*% (Kgi %*% Delta) * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dthetak %*% rSKgi))/sKgi^2
        
        if(logN)
          dLdtk <- dLdtk * Lambda
        
        dK_dthetak <- add_diag(sigma2 * dC_dthetak, drop(dLdtk)/mult) # dK/dtheta[k]
        dLogL_dtheta[i] <- (nu + N)/(2 * (nu + psi - 2)) * (crossprod((Z - beta0)/LambdaN * rep(dLdtk, times = mult), (Z - beta0)/LambdaN) - crossprod((Z0 - beta0)/Lambda * mult * dLdtk, (Z0 - beta0)/Lambda) +
                                                              crossprod(KiZ0, dK_dthetak) %*% KiZ0) - 1/2 * trace_sym(Ki, dK_dthetak)
        dLogL_dtheta[i] <- dLogL_dtheta[i] - 1/2 * sum((mult - 1) * dLdtk/Lambda) # derivative of the sum(a_i - 1)log(lambda_i)
        
        if(penalty)
          dLogL_dtheta[i] <- dLogL_dtheta[i]  + 1/2 * crossprod(KgiD, dCg_dthetak) %*% KgiD / nu_hat_var  - 1/2 * trace_sym(Kgi, dCg_dthetak)
        
      }else{
        dLogL_dtheta[i] <- sigma2 * (nu + N)/(2 * (nu + psi - 2)) * crossprod(KiZ0, dC_dthetak) %*% KiZ0  - sigma2/2 * trace_sym(Ki, dC_dthetak)
      }
    }
  }
  
  # Derivative of logL with respect to Lambda
  if(any(c("Delta", "g", "k_theta_g", "theta_g", "pX") %in% components)){
    dLogLdLambda <- (nu + N)/(2 * (nu + psi - 2)) * ((fast_tUY2(mult, (Z - beta0)^2) - (Z0 - beta0)^2 * mult)/Lambda^2 + KiZ0^2/mult) - (mult - 1)/(2*Lambda) - 1/(2*mult) * diag(Ki)
    
    if(logN){
      dLogLdLambda <- Lambda * dLogLdLambda
    }
  }
  
  
  ## Derivative of Lambda with respect to Delta
  if("Delta" %in% components)
    # dLogL_dDelta <- crossprod(M - tcrossprod(M, matrix(rep(rSKgi / sKgi, ncol(Kgi)), ncol(Kgi))) + matrix(rep(rSKgi / sKgi, nrow(M)), nrow(M), byrow = T), dLogLdLambda) #chain rule
    dLogL_dDelta <- crossprod(M, dLogLdLambda) + rSKgi/sKgi*sum(dLogLdLambda) -  rSKgi / sKgi * sum(crossprod(M, dLogLdLambda)) #chain rule
  
  # Derivative Lambda / k_theta_g
  if("k_theta_g" %in% components){
    dCg_dk <- partial_cov_gen(X1 = X0, theta = theta, k_theta_g = k_theta_g, arg = "k_theta_g", type = covtype) * Cg
    
    dLogL_dkthetag <- dCg_dk %*% KgiD - M %*% (dCg_dk %*% KgiD) -
      (1 - rsM) * drop(rSKgi %*% dCg_dk %*% (Kgi %*% Delta) * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dk %*% rSKgi))/sKgi^2
    
    dLogL_dkthetag <- crossprod(dLogL_dkthetag, dLogLdLambda) ## chain rule
  }
  
  # Derivative Lambda / theta_g
  if("theta_g" %in% components){
    dLogL_dthetag <- rep(NA, length(theta_g))
    
    for(i in 1:length(theta_g)){
      
      if(length(theta_g) == 1){
        dCg_dthetagk <- partial_cov_gen(X1 = X0, theta = theta_g, arg = "theta_k", type = covtype) * Cg # partial derivative of Cg with respect to theta
      }else{
        dCg_dthetagk <- partial_cov_gen(X1 = X0[, i, drop = FALSE], theta = theta_g[i], arg = "theta_k", type = covtype) * Cg # partial derivative of Cg with respect to theta
      }
      
      dLogL_dthetag[i] <- crossprod(dCg_dthetagk %*% KgiD - M %*% (dCg_dthetagk %*% KgiD) -
                                      (1 - rsM) * drop(rSKgi %*% dCg_dthetagk %*% (Kgi %*% Delta) * sKgi - rSKgi %*% Delta * (rSKgi %*% dCg_dthetagk %*% rSKgi))/sKgi^2, dLogLdLambda) #chain rule
      # Penalty term
      if(penalty) dLogL_dthetag[i] <- dLogL_dthetag[i] + 1/2 * crossprod(KgiD, dCg_dthetagk) %*% KgiD/nu_hat_var - trace_sym(Kgi, dCg_dthetagk)/2 
    }
  }
  
  if("sigma2" %in% components) 
    dLogL_dsigma2 <- (nu + N)/(2 * (nu + psi - 2)) * crossprod(KiZ0, C) %*% KiZ0 -1/2 * trace_sym(Ki, C)
  
  if("nu" %in% components) 
    dLogL_dnu <- -N / (2*(nu - 2)) + 1/2*digamma((nu + N)/2) - 1/2*digamma(nu/2) - 1/2 * log(1 + psi/(nu - 2)) + psi * (nu + N)/(2*(nu - 2)^2 + 2 * psi * (nu - 2))
  
  ## Derivative Lambda / g
  if("g" %in% components){
    dLogL_dg <- crossprod(-M %*% (KgiD/mult) - (1 - rsM) * drop(Delta %*% (Kgi %*% (rSKgi/mult)) * sKgi - rSKgi %*% Delta * sum(rSKgi^2/mult))/sKgi^2, dLogLdLambda) #chain rule
  }
  
  
  # Additional penalty terms on Delta
  if(penalty){
    if("Delta" %in% components){
      dLogL_dDelta <- dLogL_dDelta - KgiD / nu_hat_var
    }
    if("k_theta_g" %in% components){
      dLogL_dkthetag <- dLogL_dkthetag + 1/2 * crossprod(KgiD, dCg_dk) %*% KgiD / nu_hat_var - trace_sym(Kgi, dCg_dk)/2 
    }
    if("g" %in% components){
      dLogL_dg <- dLogL_dg + 1/2 * crossprod(KgiD/mult, KgiD) / nu_hat_var - sum(diag(Kgi)/mult)/2 
    }
    
  }
  
  return(c(dLogL_dtheta,
           dLogL_dDelta,
           dLogL_dkthetag,
           dLogL_dthetag,
           dLogL_dsigma2,
           dLogL_dnu,
           dLogL_dg))
  
}


#' @title Student-t process modeling with heteroskedastic noise
#' @description 
#' Student-t process regression under input dependent noise based on maximum likelihood estimation of the hyperparameters.
#' A GP is used to model latent (log-) variances.
#' This function is enhanced to deal with replicated observations.
#' @param X matrix of all designs, one per row, or list with elements:
#' \itemize{
#'   \item \code{X0} matrix of unique design locations, one point per row
#'   \item \code{Z0} vector of averaged observations, of length \code{nrow(X0)}
#'   \item \code{mult} number of replicates at designs in \code{X0}, of length \code{nrow(X0)}
#' } 
#' @param Z vector of all observations. If using a list with \code{X}, \code{Z} has to be ordered with respect to \code{X0}, and of length \code{sum(mult)}
#' @param lower,upper bounds for the \code{theta} parameter (see \code{\link[hetGP]{cov_gen}} for the exact parameterization).
#' In the multivariate case, it is possible to give vectors for bounds (resp. scalars) for anisotropy (resp. isotropy)
#' @param noiseControl list with elements related to optimization of the noise process parameters:
#' \itemize{
#' \item \code{g_min}, \code{g_max} minimal and maximal noise to signal ratio (of the mean process)
#' \item \code{lowerDelta}, \code{upperDelta} optional vectors (or scalars) of bounds on \code{Delta}, of length \code{nrow(X0)} (default to \code{rep(eps, nrow(X0))} and \code{rep(noiseControl$g_max, nrow(X0))} resp., or their \code{log}) 
## ' \item lowerpX, upperpX optional vectors of bounds of the input domain if pX is used.
#' \item \code{lowerTheta_g}, \code{upperTheta_g} optional vectors of bounds for the lengthscales of the noise process if \code{linkThetas == 'none'}.
#' Same as for \code{theta} if not provided.
#' \item \code{k_theta_g_bounds} if \code{linkThetas == 'joint'}, vector with minimal and maximal values for \code{k_theta_g} (default to \code{c(1, 100)}). See Details.
#' \item \code{g_bounds} vector for minimal and maximal noise to signal ratios for the noise of the noise process, i.e., the smoothing parameter for the noise process.
#' (default to \code{c(1e-6, 1)}).
#' \item \code{sigma2_bounds}, vector providing minimal and maximal signal variance.
#' \item \code{nu_bounds}, vector providing minimal and maximal values for the degrees of freedom. 
#'}
#' @param settings list for options about the general modeling procedure, with elements:
#' \itemize{
#'   \item \code{linkThetas} defines the relation between lengthscales of the mean and noise processes.
#'   Either \code{'none'}, \code{'joint'}(default) or \code{'constr'}, see Details.
#'   \item \code{logN}, when \code{TRUE} (default), the log-noise process is modeled.
#'   \item \code{initStrategy} one of \code{'simple'}, \code{'residuals'} (default) and \code{'smoothed'} to obtain starting values for \code{Delta}, see Details
#'   \item \code{penalty} when \code{TRUE}, the penalized version of the likelihood is used (i.e., the sum of the log-likelihoods of the mean and variance processes, see References).
## '   \item \code{hardpenalty} is \code{TRUE}, the log-likelihood from the noise GP is taken into account only if negative.
#'   \item \code{checkHom} when \code{TRUE}, if the log-likelihood with a homoskedastic model is better, then return it.
#'   \item \code{trace} optional scalar (default to \code{0}). If positive, tracing information on the fitting process.
#' If \code{1}, information is given about the result of the heterogeneous model optimization.
#' Level \code{2} gives more details. Level \code{3} additionaly displays all details about initialization of hyperparameters.
#' \item \code{return.matrices} boolean too include the inverse covariance matrix in the object for further use (e.g., prediction).
#' \item Arguments \code{factr} (default to 1e9) and \code{pgtol} are available to be passed to \code{control} for L-BFGS-B in \code{\link[stats]{optim}}.   
#' }
#' @param eps jitter used in the inversion of the covariance matrix for numerical stability
#' @param init,known optional lists of starting values for mle optimization or that should not be optimized over, respectively.
#' Values in \code{known} are not modified, while it can happen to those of \code{init}, see Details. 
#' One can set one or several of the following:
#' \itemize{
#' \item \code{theta} lengthscale parameter(s) for the mean process either one value (isotropic) or a vector (anistropic)
#' \item \code{Delta} vector of nuggets corresponding to each design in \code{X0}, that are smoothed to give \code{Lambda}
#' (as the global covariance matrix depend on \code{Delta} and \code{nu_hat}, it is recommended to also pass values for \code{theta})
#' \item \code{beta0} constant trend of the mean process
#' \item \code{k_theta_g} constant used for link mean and noise processes lengthscales, when \code{settings$linkThetas == 'joint'}
#' \item \code{theta_g} either one value (isotropic) or a vector (anistropic) for lengthscale parameter(s) of the noise process, when \code{settings$linkThetas != 'joint'}
#' \item \code{g} scalar nugget of the noise process
#' \item \code{nu} degree of freedom parameter
#' \item \code{sigma2} scale variance
#' \item \code{g_H} scalar homoskedastic nugget for the initialisation with a \code{\link[hetGP]{mleHomGP}}. See Details.
## '\item pX matrix of fixed pseudo inputs locations of the noise process corresponding to Delta
#' }
#' @param covtype covariance kernel type, either \code{'Gaussian'}, \code{'Matern5_2'} or \code{'Matern3_2'}, see \code{\link[hetGP]{cov_gen}}
#' @param maxit maximum number of iterations for \code{L-BFGS-B} of \code{\link[stats]{optim}} dedicated to maximum likelihood optimization
#' @details
#' 
#' The global covariance matrix of the model is parameterized as \code{K = sigma2 * C + Lambda * diag(1/mult)},
#' with \code{C} the correlation matrix between unique designs, depending on the family of kernel used (see \code{\link[hetGP]{cov_gen}} for available choices).
#' \code{Lambda} is the prediction on the noise level given by a (homoskedastic) GP: \cr
#' \deqn{\Lambda = C_g(C_g + \mathrm{diag}(g/\mathrm{mult}))^{-1} \Delta} \cr
#' with \code{C_g} the correlation matrix between unique designs for this second GP, with lengthscales hyperparameters \code{theta_g} and nugget \code{g}
#' and \code{Delta} the variance level at \code{X0} that are estimated.
#' 
#' It is generally recommended to use \code{\link[hetGP]{find_reps}} to pre-process the data, to rescale the inputs to the unit cube and to normalize the outputs.
#' 
#' The noise process lengthscales can be set in several ways:
#' \itemize{
#' \item using \code{k_theta_g} (\code{settings$linkThetas == 'joint'}), supposed to be greater than one by default. 
#' In this case lengthscales of the noise process are multiples of those of the mean process.
#' \item if \code{settings$linkThetas == 'constr'}, then the lower bound on \code{theta_g} correspond to estimated values of an homoskedastic GP fit.
#' \item else lengthscales between the mean and noise process are independent (both either anisotropic or not).
#' }
#'
#' When no starting nor fixed parameter values are provided with \code{init} or \code{known}, 
#' the initialization process consists of fitting first an homoskedastic model of the data, called \code{modHom}.
#' Unless provided with \code{init$theta}, initial lengthscales are taken at 10\% of the range determined with \code{lower} and \code{upper},
#' while \code{init$g_H} may be use to pass an initial nugget value.
#' The resulting lengthscales provide initial values for \code{theta} (or update them if given in \code{init}). \cr \cr
#' If necessary, a second homoskedastic model, \code{modNugs}, is fitted to the empirical residual variance between the prediction
#'  given by \code{modHom} at \code{X0} and \code{Z} (up to \code{modHom$nu_hat}).
#' Note that when specifying \code{settings$linkThetas == 'joint'}, then this second homoskedastic model has fixed lengthscale parameters.
#' Starting values for \code{theta_g} and \code{g} are extracted from \code{modNugs}.\cr \cr
#' Finally, three initialization schemes for \code{Delta} are available with \code{settings$initStrategy}: 
#' \itemize{
#' \item for \code{settings$initStrategy == 'simple'}, \code{Delta} is simply initialized to the estimated \code{g} value of \code{modHom}. 
#' Note that this procedure may fail when \code{settings$penalty == TRUE}.
#' \item for \code{settings$initStrategy == 'residuals'}, \code{Delta} is initialized to the estimated residual variance from the homoskedastic mean prediction.
#' \item for \code{settings$initStrategy == 'smoothed'}, \code{Delta} takes the values predicted by \code{modNugs} at \code{X0}.
#' }
#'
#' Notice that \code{lower} and \code{upper} bounds cannot be equal for \code{\link[stats]{optim}}.
#'
#' @return a list which is given the S3 class \code{"hetTP"}, with elements:
#' \itemize{
#' \item \code{theta}: unless given, maximum likelihood estimate (mle) of the lengthscale parameter(s),
#' \item \code{Delta}: unless given, mle of the nugget vector (non-smoothed),
#' \item \code{Lambda}: predicted input noise variance at \code{X0}, 
#' \item \code{sigma2}: plugin estimator of the variance,
#' \item \code{theta_g}: unless given, mle of the lengthscale(s) of the noise/log-noise process,
#' \item \code{k_theta_g}: if \code{settings$linkThetas == 'joint'}, mle for the constant by which lengthscale parameters of \code{theta} are multiplied to get \code{theta_g},
#' \item \code{g}: unless given, mle of the nugget of the noise/log-noise process,
#' \item \code{trendtype}: either "\code{SK}" if \code{beta0} is provided, else "\code{OK}",
#' \item \code{beta0} constant trend of the mean process, plugin-estimator unless given,
#' \item \code{nmean}: plugin estimator for the constant noise/log-noise process mean,
## ' \item \code{pX}: if used, matrix of pseudo-inputs locations for the noise/log-noise process,
#' \item \code{ll}: log-likelihood value, (\code{ll_non_pen}) is the value without the penalty,
#' \item \code{nit_opt}, \code{msg}: \code{counts} and \code{message} returned by \code{\link[stats]{optim}}
#' \item \code{modHom}: homoskedastic GP model of class \code{homGP} used for initialization of the mean process,
#' \item \code{modNugs}: homoskedastic GP model of class \code{homGP} used for initialization of the noise/log-noise process,
#' \item \code{nu_hat_var}: variance of the noise process,
#' \item \code{used_args}: list with arguments provided in the call to the function, which is saved in \code{call},
#' \item \code{X0}, \code{Z0}, \code{Z}, \code{eps}, \code{logN}, \code{covtype}: values given in input,
#' \item \code{time}: time to train the model, in seconds.
#'}
#' @seealso \code{\link[hetGP]{predict.hetTP}} for predictions. 
#' \code{summary} and \code{plot} functions are available as well. 
#' @references 
#' M. Binois, Robert B. Gramacy, M. Ludkovski (2018), Practical heteroskedastic Gaussian process modeling for large simulation experiments,
#' Journal of Computational and Graphical Statistics, 27(4), 808--821.\cr 
#' Preprint available on arXiv:1611.05902.\cr \cr
#'  
#' A. Shah, A. Wilson, Z. Ghahramani (2014), Student-t processes as alternatives to Gaussian processes, Artificial Intelligence and Statistics, 877--885.
#' 
#' @export
#' @importFrom stats optim var
#' @import methods
#' @examples 
#' ##------------------------------------------------------------
#' ## Example 1: Heteroskedastic TP modeling on the motorcycle data
#' ##------------------------------------------------------------
#' set.seed(32)
#' 
#' ## motorcycle data
#' library(MASS)
#' X <- matrix(mcycle$times, ncol = 1)
#' Z <- mcycle$accel
#' nvar <- 1
#' plot(X, Z, ylim = c(-160, 90), ylab = 'acceleration', xlab = "time")
#' 
#' 
#' ## Model fitting
#' model <- mleHetTP(X = X, Z = Z, lower = rep(0.1, nvar), upper = rep(50, nvar),
#'                   covtype = "Matern5_2")
#'             
#' ## Display averaged observations
#' points(model$X0, model$Z0, pch = 20)
#' 
#' ## A quick view of the fit                  
#' summary(model)
#' 
#' ## Create a prediction grid and obtain predictions
#' xgrid <- matrix(seq(0, 60, length.out = 301), ncol = 1) 
#' preds <- predict(x = xgrid, object =  model)
#' 
#' ## Display mean predictive surface
#' lines(xgrid, preds$mean, col = 'red', lwd = 2)
#' ## Display 95% confidence intervals
#' lines(xgrid, preds$mean + sqrt(preds$sd2) * qt(0.05, df = model$nu + nrow(X)), col = 2, lty = 2)
#' lines(xgrid, preds$mean + sqrt(preds$sd2) * qt(0.95, df = model$nu + nrow(X)), col = 2, lty = 2)
#' ## Display 95% prediction intervals
#' lines(xgrid, preds$mean + sqrt(preds$sd2 + preds$nugs) * qt(0.05, df = model$nu + nrow(X)),
#'   col = 3, lty = 2)
#' lines(xgrid, preds$mean + sqrt(preds$sd2 + preds$nugs) * qt(0.95, df = model$nu + nrow(X)), 
#'   col = 3, lty = 2)
#' 
#' ##------------------------------------------------------------
#' ## Example 2: 2D Heteroskedastic TP modeling
#' ##------------------------------------------------------------
#' set.seed(1)
#' nvar <- 2
#'   
#' ## Branin redefined in [0,1]^2
#' branin <- function(x){
#'   if(is.null(nrow(x)))
#'     x <- matrix(x, nrow = 1)
#'     x1 <- x[,1] * 15 - 5
#'     x2 <- x[,2] * 15
#'     (x2 - 5/(4 * pi^2) * (x1^2) + 5/pi * x1 - 6)^2 + 10 * (1 - 1/(8 * pi)) * cos(x1) + 10
#' }
#' 
#' ## Noise field via standard deviation
#' noiseFun <- function(x){
#'   if(is.null(nrow(x)))
#'     x <- matrix(x, nrow = 1)
#'   return(1/5*(3*(2 + 2*sin(x[,1]*pi)*cos(x[,2]*3*pi) + 5*rowSums(x^2))))
#' }
#' 
#' ## data generating function combining mean and noise fields
#' ftest <- function(x){
#'   return(branin(x) + rnorm(nrow(x), mean = 0, sd = noiseFun(x)))
#' }
#' 
#' ## Grid of predictive locations
#' ngrid <- 51
#' xgrid <- matrix(seq(0, 1, length.out = ngrid), ncol = 1) 
#' Xgrid <- as.matrix(expand.grid(xgrid, xgrid))
#' 
#' ## Unique (randomly chosen) design locations
#' n <- 100
#' Xu <- matrix(runif(n * 2), n)
#' 
#' ## Select replication sites randomly
#' X <- Xu[sample(1:n, 20*n, replace = TRUE),]
#' 
#' ## obtain training data response at design locations X
#' Z <- ftest(X)
#' 
#' ## Formating of data for model creation (find replicated observations) 
#' prdata <- find_reps(X, Z, rescale = FALSE, normalize = FALSE)
#'
#' ## Model fitting
#' model <- mleHetTP(X = list(X0 = prdata$X0, Z0 = prdata$Z0, mult = prdata$mult), Z = prdata$Z, ,
#'                   lower = rep(0.01, nvar), upper = rep(10, nvar),
#'                   covtype = "Matern5_2")
#'
#' ## a quick view into the data stored in the "hetTP"-class object
#' summary(model)                  
#'              
#' ## prediction from the fit on the grid     
#' preds <- predict(x = Xgrid, object =  model)
#' 
#' ## Visualization of the predictive surface
#' par(mfrow = c(2, 2))
#' contour(x = xgrid,  y = xgrid, z = matrix(branin(Xgrid), ngrid), 
#'   main = "Branin function", nlevels = 20)
#' points(X, col = 'blue', pch = 20)
#' contour(x = xgrid,  y = xgrid, z = matrix(preds$mean, ngrid), 
#'   main = "Predicted mean", nlevels = 20)
#' points(X, col = 'blue', pch = 20)
#' contour(x = xgrid,  y = xgrid, z = matrix(noiseFun(Xgrid), ngrid), 
#'   main = "Noise standard deviation function", nlevels = 20)
#' points(X, col = 'blue', pch = 20)
#' contour(x = xgrid,  y= xgrid, z = matrix(sqrt(preds$nugs), ngrid), 
#'   main = "Predicted noise values", nlevels = 20)
#' points(X, col = 'blue', pch = 20)
#' par(mfrow = c(1, 1))
##
mleHetTP <- function(X, Z, lower = NULL, upper = NULL,
                     noiseControl = list(k_theta_g_bounds = c(1, 100), g_max = 1e4, g_bounds = c(1e-6, 0.1),
                                         nu_bounds = c(2 + 1e-3, 30), sigma2_bounds = c(sqrt(.Machine$double.eps), 1e4)),
                     settings = list(linkThetas = 'joint', logN = TRUE, initStrategy = 'residuals', checkHom = TRUE,
                                     penalty = TRUE, trace = 0, return.matrices = TRUE, return.hom = FALSE, factr = 1e9), 
                     covtype = c("Gaussian", "Matern5_2", "Matern3_2"), maxit = 100, known = list(beta0 = 0),
                     init = list(nu = 3), eps = sqrt(.Machine$double.eps)){
  
  if(typeof(X)=="list"){
    X0 <- X$X0
    Z0 <- X$Z0
    mult <- X$mult
    if(sum(mult) != length(Z)) stop("Length(Z) should be equal to sum(mult)")
    if(is.null(dim(X0))) X0 <- matrix(X0, ncol = 1)
    if(length(Z0) != nrow(X0)) stop("Dimension mismatch between Z0 and X0")
  }else{
    if(is.null(dim(X))) X <- matrix(X, ncol = 1)
    if(nrow(X) != length(Z)) stop("Dimension mismatch between Z and X")
    elem <- find_reps(X, Z, return.Zlist = F)
    X0 <- elem$X0
    Z0 <- elem$Z0
    Z <- elem$Z
    mult <- elem$mult
  }
  
  covtype <- match.arg(covtype)
  
  if(is.null(lower) || is.null(upper)){
    auto_thetas <- auto_bounds(X = X0, covtype = covtype)
    if(is.null(lower)) lower <- auto_thetas$lower
    if(is.null(upper)) upper <- auto_thetas$upper
  }
  
  if(length(lower) != length(upper)) stop("upper and lower should have the same size")
  
  ## Save time to train model
  tic <- proc.time()[3]
  
  ## Initial checks
  
  n <- nrow(X0)
  
  if(is.null(n))
    stop("X0 should be a matrix. \n")

  jointThetas <- constrThetas <- FALSE
  
  if(is.null(settings$linkThetas)){
    jointThetas <- TRUE
  }else{
    if(settings$linkThetas == 'joint'){
      jointThetas <- TRUE
    }else{
      if(settings$linkThetas == 'constr')
        constrThetas <- TRUE
    }
  }
  
  logN <- TRUE
  if(!is.null(settings$logN)){
    logN <- settings$logN
  }
  
  if(is.null(settings$return.matrices))
    settings$return.matrices <- TRUE
  
  if(is.null(settings$return.hom))
    settings$return.hom <- FALSE
  
  if(jointThetas && is.null(noiseControl$k_theta_g_bounds))
    noiseControl$k_theta_g_bounds <- c(1, 100)
  
  if(is.null(settings$initStrategy))
    settings$initStrategy <- 'residuals'
  
  if(is.null(settings$factr))
    settings$factr <- 1e9
  
  penalty <- TRUE
  if(!is.null(settings$penalty))
    penalty <- settings$penalty
  
  # hardpenalty <- FALSE
  # if(!is.null(settings$hardpenalty))
  #   hardpenalty <- settings$hardpenalty
  
  if(is.null(settings$checkHom))
    settings$checkHom <- TRUE
  
  trace <- 0
  if(!is.null(settings$trace))
    trace <- settings$trace
  
  components <- NULL
  if(is.null(known$theta)){
    components <- c(components, list("theta"))
  }else{
    init$theta <- known$theta
  }
  
  if(is.null(known$Delta)){
    components <- c(components, list("Delta"))
  }else{
    init$Delta <- known$Delta
  }
  
  if(jointThetas && is.null(known$k_theta_g)){
    components <- c(components, list("k_theta_g"))
    
  }else{
    init$k_theta_g <- known$k_theta_g
  }
  
  if(!jointThetas && is.null(known$theta_g)){
    components <- c(components, list("theta_g"))
  }else{
    init$theta_g <- known$theta_g
  }
  
  if(is.null(known$sigma2)){
    components <- c(components, list("sigma2"))
    if(is.null(init$sigma2)) init$sigma2 <- var(Z0)
  }else{
    init$sigma2 <- known$sigma2
  }
  
  if(is.null(known$nu)){
    components <- c(components, list("nu"))
    if(is.null(init$nu)) init$nu <- 3
  }else{
    init$nu <- known$nu
  }
  
  if(is.null(known$g)){
    components <- c(components, list("g"))
  }else{
    init$g <- known$g
  }
  
  if(!is.null(init$pX)){
    components <- c(components, list("pX"))
    idcs_pX <- which(duplicated(rbind(init$pX, X0))) - nrow(init$pX) ## Indices of starting points in pX
  }else{
    if(!is.null(known$pX)){
      init$pX <- known$pX
    }else{
      init$pX <- NULL
    }
  }
  
  if(penalty && "pX" %in% components){
    penalty <- FALSE
    warning("Penalty not available with pseudo-inputs for now")
  }
  
  trendtype <- 'SK'
  if(is.null(known$beta0))  known$beta0 <- 0
  beta0 <- known$beta0
  
  if(is.null(components) && is.null(known$theta_g)){
    known$theta_g <- known$k_theta_g * known$theta
  }
  
  if(is.null(noiseControl$g_bounds)){
    noiseControl$g_bounds <- c(1e-6, 0.1)
  }
  
  if(is.null(noiseControl$nu_bounds)){
    noiseControl$nu_bounds <- c(2 + 1e-3, 30)
  }
  
  if(is.null(noiseControl$sigma2_bounds)){
    noiseControl$sigma2_bounds <- c(sqrt(.Machine$double.eps), 1e4)
  }
  
  # ## Advanced option Single Nugget Kriging model for the noise process
  # if(!is.null(noiseControl$SiNK) && noiseControl$SiNK){
  #   SiNK <- TRUE
  #   if(is.null(noiseControl$SiNK_eps)){
  #     SiNK_eps <- 1e-4
  #   }else{
  #     SiNK_eps <- noiseControl$SiNK_eps
  #   }
  # }else{
  #   SiNK <- FALSE
  #   SiNK_eps <- 1e-4
  # } 
  
  
  ### Automatic Initialisation
  modHom <- modNugs <- NULL
  if(is.null(init$theta) || is.null(init$Delta) || is.null(init$sigma2)){
    ## A) homoskedastic mean process
    
    if(!is.null(known$g_H)){
      g_init <- NULL
      # noiseControl$g_max <- known$g_H + eps
      # noiseControl$g_min <- known$g_H
    }else{
      g_init <- init$g_H
      ## Initial value for g of the homoskedastic process: based on the mean variance at replicates compared to the variance of Z0
      if(any(mult > 5)){
        mean_var_replicates <- mean((fast_tUY2(mult, (Z - rep(Z0, times = mult))^2)/mult)[which(mult > 5)])
        
        if(is.null(g_init)) g_init <- mean_var_replicates 
        if(is.null(noiseControl$g_max))
          noiseControl$g_max <- max(1e2, 100 * g_init)
        if(is.null(noiseControl$g_min))
          noiseControl$g_min <- eps
        
      }else{
        if(is.null(g_init)) g_init <- 0.05
        
        if(is.null(noiseControl$g_max))
          noiseControl$g_max <- 1e2
        
        if(is.null(noiseControl$g_min))
          noiseControl$g_min <- eps
        
      }
      
    }
    
    if(is.null(init$theta))
      init$theta <- sqrt(upper * lower)
    
    if(settings$checkHom){
      rKI <- TRUE
    }else{
      rKI <- FALSE
    } 
    modHom <- mleHomTP(X = list(X0 = X0, Z0 = Z0, mult = mult), Z = Z, lower = lower,
                       known = list(theta = known$theta, g = known$g_H, nu = known$nu, sigma2 = known$sigma2, beta0 = beta0),
                       init = list(theta = init$theta, g = g_init, nu = init$nu, sigma2 = init$sigma2),
                       upper = upper, covtype = covtype, maxit = maxit,
                       noiseControl = list(g_bounds = c(noiseControl$g_min, noiseControl$g_max),
                                           nu_bounds = noiseControl$nu_bounds), eps = eps, 
                       settings = list(return.Ki = rKI, factr = settings$factr, pgtol = settings$pgtol))
    
    ## Update init values
    if(is.null(known$theta)) init$theta <- modHom$theta
    
    if(is.null(known$sigma2)) init$sigma2 <- modHom$sigma2
    
    if(is.null(known$nu) && is.null(init$nu)) init$nu <- modHom$nu # Estimating nu may be hard, and even more for homTP
    
    if(is.null(init$Delta)){
      predHom <- suppressWarnings(predict(x = X0, object = modHom)$mean)
      nugs_est <- (Z - rep(predHom, times = mult))^2 #squared deviation from the homoskedastic prediction mean to the actual observations

      if(logN){
        nugs_est <- log(nugs_est)
      }
      
      nugs_est0 <- drop(fast_tUY2(mult, nugs_est))/mult # average
      
    }else{
      nugs_est0 <- init$Delta
    }
    
    if(constrThetas){
      noiseControl$lowerTheta_g <- modHom$theta
    }
    
    if(settings$initStrategy == 'simple'){
      if(logN){
        init$Delta <- rep(log(modHom$g), nrow(X0))
      }else{
        init$Delta <- rep(modHom$g, nrow(X0))
      }
    }
    if(settings$initStrategy == 'residuals')
      init$Delta <- nugs_est0
    
  }
  
  
  if((is.null(init$theta_g) && is.null(init$k_theta_g)) || is.null(init$g)){
    
    ## B) Homegeneous noise process
    if(jointThetas){
      if(is.null(init$k_theta_g)){
        init$k_theta_g <- 1
        init$theta_g <- init$theta
      }else{
        init$theta_g <- init$k_theta_g * init$theta
      }
      
      if(is.null(noiseControl$lowerTheta_g)){
        noiseControl$lowerTheta_g <- init$theta_g - eps
      }
      
      if(is.null(noiseControl$upperTheta_g)){
        noiseControl$upperTheta_g <- init$theta_g + eps
      }
    }
    
    if(!jointThetas && is.null(init$theta_g)){
      init$theta_g <- init$theta
    }
    
    if(is.null(noiseControl$lowerTheta_g)){
      noiseControl$lowerTheta_g <- lower
    }
    
    if(is.null(noiseControl$upperTheta_g)){
      noiseControl$upperTheta_g <- upper
    }
    
    ## If an homegeneous process of the mean has already been computed, it is used for estimating the parameters of the noise process
    if(exists("nugs_est")){
      
      if(is.null(init$g)){
        mean_var_replicates_nugs <- mean((fast_tUY2(mult, (nugs_est - rep(nugs_est0, times = mult))^2)/mult))
        init$g <- mean_var_replicates_nugs
      }
      
      modNugs <- mleHomGP(X = list(X0 = X0, Z0 = nugs_est0, mult = mult), Z = nugs_est,
                          lower = noiseControl$lowerTheta_g, upper = noiseControl$upperTheta_g,
                          init = list(theta = init$theta_g, g =  init$g), covtype = covtype, noiseControl = noiseControl,
                          maxit = maxit, eps = eps, settings = list(return.Ki = F, factr = settings$factr, pgtol = settings$pgtol))
      prednugs <- suppressWarnings(predict(x = X0, object = modNugs))
      
    }else{
      if(!exists("nugs_est0")) nugs_est0 <- init$Delta
      
      if(is.null(init$g)){
        init$g <- 0.05 * var(nugs_est0)
      }
      
      modNugs <- mleHomGP(X = list(X0 = X0, Z0 = nugs_est0, mult = rep(1, nrow(X0))),
                          Z = nugs_est0,
                          lower = noiseControl$lowerTheta_g, upper = noiseControl$upperTheta_g,
                          init = list(theta = init$theta_g, g =  init$g), covtype = covtype, noiseControl = noiseControl,
                          maxit = maxit, eps = eps, settings = list(return.Ki = F, factr = settings$factr, pgtol = settings$pgtol))
      prednugs <- suppressWarnings(predict(x = X0, object = modNugs))
    }
    
    if(settings$initStrategy == 'simple'){
      if(logN){
        init$Delta <- rep(log(modHom$g), nrow(X0))
      }else{
        init$Delta <- rep(modHom$g, nrow(X0))
      }
    }
    if(settings$initStrategy == 'residuals')
      init$Delta <- nugs_est0
    if(settings$initStrategy == 'smoothed')
      init$Delta <- prednugs$mean
    
    if(is.null(known$g))
      init$g <- modNugs$g
    
    if(jointThetas && is.null(init$k_theta_g))
      init$k_theta_g <- 1
    
    if(!jointThetas && is.null(init$theta_g))
      init$theta_g <- modNugs$theta
    
  }
  
  if(is.null(noiseControl$lowerTheta_g)){
    noiseControl$lowerTheta_g <- lower
  }
  
  if(is.null(noiseControl$upperTheta_g)){
    noiseControl$upperTheta_g <- upper
  }
  
  ### Start of optimization of the log-likelihood
  fn <- function(par, X0, Z0, Z, mult, Delta = NULL, theta = NULL, g = NULL, k_theta_g = NULL, theta_g = NULL,
                 nu = NULL, sigma2 = NULL, logN = FALSE, beta0 = NULL, hom_ll, env = NULL){
    
    idx <- 1 # to store the first non used element of par
    
    if(is.null(theta)){
      idx <- length(init$theta)
      theta <- par[1:idx]
      idx <- idx + 1
    }
    if(is.null(Delta)){
      Delta <- par[idx:(idx - 1 + length(init$Delta))]
      idx <- idx + length(init$Delta)
    }
    if(jointThetas && is.null(k_theta_g)){
      k_theta_g <- par[idx]
      idx <- idx + 1
    }
    if(!jointThetas && is.null(theta_g)){
      theta_g <- par[idx:(idx - 1 + length(init$theta_g))]
      idx <- idx + length(init$theta_g)
    }
    
    if(is.null(sigma2)){
      sigma2 <- par[idx]
      idx <- idx + 1
    }
    
    if(is.null(nu)){
      nu <- par[idx]
      idx <- idx + 1
    }
    
    if(is.null(g)){
      g <- par[idx]
    }
    
    loglik <-logLik_HetTP(X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = Delta, theta = theta, g = g, k_theta_g = k_theta_g, theta_g = theta_g,
                          nu = nu, sigma2 = sigma2, logN = logN, beta0 = beta0, covtype = covtype, eps = eps,
                          penalty = penalty, hom_ll = hom_ll, env = env, trace = trace)
    
    if(!is.null(env) && !is.na(loglik)){
      if(is.null(env$max_loglik) || loglik > env$max_loglik){
        env$max_loglik <- loglik
        env$arg_max <- par
      }
    }
    
    return(loglik)
  }
  
  
  gr <- function(par, X0, Z0, Z, mult, Delta = NULL, theta = NULL, g = NULL, k_theta_g = NULL, theta_g = NULL, nu = NULL, sigma2 = NULL, logN = FALSE,
                 beta0 = NULL, pX = NULL, hom_ll, env = NULL){
    
    idx <- 1 # to store the first non used element of par
    
    if(is.null(theta)){
      theta <- par[1:length(init$theta)]
      idx <- idx + length(init$theta)
    }
    if(is.null(Delta)){
      Delta <- par[idx:(idx - 1 + length(init$Delta))]
      idx <- idx + length(init$Delta)
    }
    if(jointThetas && is.null(k_theta_g)){
      k_theta_g <- par[idx]
      idx <- idx + 1
    }
    if(!jointThetas && is.null(theta_g)){
      theta_g <- par[idx:(idx - 1 + length(init$theta_g))]
      idx <- idx + length(init$theta_g)
    }
    if(is.null(sigma2)){
      sigma2 <- par[idx]
      idx <- idx + 1
    }
    if(is.null(nu)){
      nu <- par[idx]
      idx <- idx + 1
    }
    if(is.null(g)){
      g <- par[idx]
    }
    
    # c(grad(logLikHet_Wood_Ani_snugs_pX, X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = Delta, x = theta, g = g, k_theta_g = k_theta_g, theta_g = theta_g,
    #        logN = logN, SiNK = SiNK, beta0 = beta0, pX = pX),
    #   grad(logLikHet_Wood_Ani_snugs_pX, X0 = X0, Z0 = Z0, Z = Z, mult = mult, x = Delta, theta = theta, g = g, k_theta_g = k_theta_g, theta_g = theta_g,
    #        logN = logN, SiNK = SiNK, beta0 = beta0, pX = pX),
    #   grad(logLikHet_Wood_Ani_snugs_pX, X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = Delta, theta = theta, g = g, x = k_theta_g, theta_g = theta_g,
    #        logN = logN, SiNK = SiNK, beta0 = beta0, pX = pX),
    #   grad(logLikHet_Wood_Ani_snugs_pX, X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = Delta, theta = theta, x = g, k_theta_g = k_theta_g, theta_g = theta_g,
    #        logN = logN, SiNK = SiNK, beta0 = beta0, pX = pX)
    # )
    # grad(fn, x = par, X0 = X0, Z0 = Z0, Z = Z, mult = mult, logN = logN, beta0 = beta0, method.args=list(r = 6))
    
    return(dlogLikHetTP(X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = Delta, theta = theta, g = g, k_theta_g = k_theta_g, theta_g = theta_g,
                        nu = nu, sigma2 = sigma2, logN = logN, beta0 = beta0, components = components, covtype = covtype, eps = eps,
                        penalty = penalty, hom_ll = hom_ll, env = env))
    
  }
  
  ## Pre-processing for heterogeneous fit
  parinit <- lowerOpt <- upperOpt <- NULL
  
  if(trace > 2)
    cat("Initial value of the parameters:\n")
  
  if(is.null(known$theta)){
    parinit <- init$theta
    lowerOpt <- lower
    upperOpt <- upper
    if(trace > 2) cat("Theta: ", init$theta, "\n")
  }
  
  if(is.null(known$Delta)){
    
    if(is.null(noiseControl$lowerDelta) || length(noiseControl$lowerDelta) != n){
      if(logN){
        noiseControl$lowerDelta <- log(eps)
      }else{
        noiseControl$lowerDelta <- eps
      }
    }
    
    if(length(noiseControl$lowerDelta) == 1){
      noiseControl$lowerDelta <- rep(noiseControl$lowerDelta, n)
    }
    
    if(is.null(noiseControl$g_max)) noiseControl$g_max <- 1e2 
    
    if(is.null(noiseControl$upperDelta) || length(noiseControl$upperDelta) != n){
      if(logN){
        noiseControl$upperDelta <- log(noiseControl$g_max)
      }else{
        noiseControl$upperDelta <- noiseControl$g_max
      }
    }
    
    if(length(noiseControl$upperDelta) == 1){
      noiseControl$upperDelta <- rep(noiseControl$upperDelta, n)
    }
    
    lowerOpt <- c(lowerOpt, noiseControl$lowerDelta)
    upperOpt <- c(upperOpt, noiseControl$upperDelta)
    parinit <- c(parinit, init$Delta)
    
    if(trace > 2) cat("Delta: ", init$Delta, "\n")
  }
  
  if(jointThetas && is.null(known$k_theta_g)){
    parinit <- c(parinit, init$k_theta_g)
    lowerOpt <- c(lowerOpt, noiseControl$k_theta_g_bounds[1])
    upperOpt <- c(upperOpt, noiseControl$k_theta_g_bounds[2])
    if(trace > 2) cat("k_theta_g: ", init$k_theta_g, "\n")
  }
  
  if(!jointThetas && is.null(known$theta_g)){
    parinit <- c(parinit, init$theta_g)
    lowerOpt <- c(lowerOpt, noiseControl$lowerTheta_g)
    upperOpt <- c(upperOpt, noiseControl$upperTheta_g)
    if(trace > 2) cat("theta_g: ", init$theta_g, "\n")
  }
  
  if(is.null(known$sigma2)){
    parinit <- c(parinit, init$sigma2)
    lowerOpt <- c(lowerOpt, noiseControl$sigma2_bounds[1])
    upperOpt <- c(upperOpt, noiseControl$sigma2_bounds[2])
  }
  
  if(is.null(known$nu)){
    parinit <- c(parinit, init$nu)
    lowerOpt <- c(lowerOpt, noiseControl$nu_bounds[1])
    upperOpt <- c(upperOpt, noiseControl$nu_bounds[2])
  }
  
  if(is.null(known$g)){
    parinit <- c(parinit, init$g)
    if(trace > 2) cat("g: ", init$g, "\n")
    
    lowerOpt <- c(lowerOpt, noiseControl$g_bounds[1])
    upperOpt <- c(upperOpt, noiseControl$g_bounds[2])
  }
  
  if("pX" %in% components){
    parinit <- c(parinit, as.vector(init$pX))
    lowerOpt <- c(lowerOpt, as.vector(rep(noiseControl$lowerpX, times = nrow(init$pX))))
    upperOpt <- c(upperOpt, as.vector(rep(noiseControl$upperpX, times = nrow(init$pX))))
    if(trace > 2) cat("pX: ", as.vector(init$pX), "\n")
  }
  
  
  ## Case when some parameters need to be estimated
  mle_par <- known # Store infered and known parameters
  if(!is.null(components)){
    
    if(!is.null(modHom)){
      hom_ll <- modHom$ll
    }else{
      ## Compute reference homoskedastic likelihood, with fixed theta for speed
      modHom_tmp <- mleHomTP(X = list(X0 = X0, Z0 = Z0, mult = mult), Z = Z, lower = lower, upper = upper, upper,
                             known = list(theta = known[["theta"]], g = known$g_H, nu = known$nu, sigma2 = known$sigma2, beta0 = 0),
                             covtype = covtype, init = init,
                             noiseControl = noiseControl, eps = eps, settings = list(return.Ki = F))
      
      hom_ll <- modHom_tmp$ll
    } 
    
    ## Maximization of the log-likelihood
    envtmp <- environment()
    out <- try(optim(par = parinit, fn = fn, gr = gr, method = "L-BFGS-B", lower = lowerOpt, upper = upperOpt,
                 X0 = X0, Z0 = Z0, Z = Z, mult = mult, logN = logN, Delta = known$Delta, theta = known$theta,
                 g = known$g, k_theta_g = known$k_theta_g, theta_g = known$theta_g, hom_ll = hom_ll, env = envtmp,
                 nu = known$nu, beta0 = known$beta0, sigma2 = known$sigma2, 
                 control = list(fnscale = -1, maxit = maxit, factr = settings$factr, pgtol = settings$pgtol)))
    
    ## Catch errors when at least one likelihood evaluation worked
    if(is(out, "try-error"))
      out <- list(par = envtmp$arg_max, value = envtmp$max_loglik, counts = NA,
                  message = "Optimization stopped due to NAs, use best value so far")
    
    ## Temporary
    if(trace > 0){
      print(out$message)
      cat("Number of variables at boundary values:" , length(which(out$par == upperOpt)) + length(which(out$par == lowerOpt)), "\n")
    }
    
    if(trace > 1)
      cat("Name | Value | Lower bound | Upper bound \n")
    
    ## Post-processing
    idx <- 1
    if(is.null(known[["theta"]])){
      mle_par$theta <- out$par[1:length(init$theta)]
      idx <- idx + length(init$theta)
      if(trace > 1) cat("Theta |", mle_par$theta, " | ", lower, " | ", upper, "\n")
    }

    if(is.null(known$Delta)){
      mle_par$Delta <- out$par[idx:(idx - 1 + length(init$Delta))]
      idx <- idx + length(init$Delta)
      if(trace > 1){
        for(ii in 1:ceiling(length(mle_par$Delta)/5)){
          i_tmp <- (1 + 5*(ii-1)):min(5*ii, length(mle_par$Delta))
          if(logN) cat("Delta |", mle_par$Delta[i_tmp], " | ", pmax(log(eps * mult[i_tmp]), init$Delta[i_tmp] - log(1000)), " | ", init$Delta[i_tmp] + log(100), "\n")
          if(!logN) cat("Delta |", mle_par$Delta[i_tmp], " | ", pmax(mult[i_tmp] * eps, init$Delta[i_tmp] / 1000), " | ", init$Delta[i_tmp] * 100, "\n")
        }
      }
    }
    
    if(jointThetas){
      if(is.null(known$k_theta_g)){
        mle_par$k_theta_g <- out$par[idx]
        idx <- idx + 1
      } 
      mle_par$theta_g <- mle_par$k_theta_g * mle_par$theta
      
      if(trace > 1) cat("k_theta_g |", mle_par$k_theta_g, " | ", noiseControl$k_theta_g_bounds[1], " | ", noiseControl$k_theta_g_bounds[2], "\n")
    }
    
    if(!jointThetas && is.null(known$theta_g)){
      mle_par$theta_g <- out$par[idx:(idx - 1 + length(init$theta_g))]
      idx <- idx + length(init$theta_g)
      if(trace > 1) cat("theta_g |", mle_par$theta_g, " | ", noiseControl$lowerTheta_g, " | ", noiseControl$upperTheta_g, "\n")
    }
    
    if(is.null(known$sigma2)){
      mle_par$sigma2 <- out$par[idx]
      idx <- idx + 1
    }
    
    if(is.null(known$nu)){
      mle_par$nu <- out$par[idx]
      idx <- idx + 1
    }
    
    if(is.null(known$g)){
      mle_par$g <- out$par[idx]
      idx <- idx + 1
      if(trace > 1) cat("g |", mle_par$g, " | ", noiseControl$g_bounds[1], " | ", noiseControl$g_bounds[2], "\n")
    }
    # if(idx != (length(out$par) + 1)){
    #   mle_par$pX <- matrix(out$par[idx:length(out$par)], ncol = ncol(X0))
    #   if(trace > 1) cat("pX |", as.vector(mle_par$pX), " | ", as.vector(rep(noiseControl$lowerpX, times = nrow(init$pX))), " | ", as.vector(rep(noiseControl$upperpX, times = nrow(init$pX))), "\n")
    # }
    
  }else{
    out <- list(message = "All hyperparameters given, no optimization \n", count = 0, value = NULL)
  }
  
  ## Computation of nu2
  
  if(penalty){
    ll_non_pen <- logLik_HetTP(X0 = X0, Z0 = Z0, Z = Z, mult = mult, Delta = mle_par$Delta, theta = mle_par$theta, g = mle_par$g,
                               k_theta_g = mle_par$k_theta_g, theta_g = mle_par$theta_g, sigma2 = mle_par$sigma2, nu = mle_par$nu,
                               logN = logN, beta0 = mle_par$beta0, covtype = covtype, eps = eps, penalty = FALSE, trace = trace)
  }else{
    ll_non_pen <- out$value
  }
  # if(!is.null(modHom)){
  #   if(modHom$ll >= ll_non_pen){
  #     if(trace >= 0) cat("homoskedastic model has higher log-likelihood: ", modHom$ll, " compared to ", ll_non_pen, "\n")
  #     if(settings$checkHom){
  #       if(trace >= 0) cat("Return homoskedastic model \n")
  #       return(modHom)
  #     }
  #   }
  # }
  
  # if(is.null(mle_par$pX)){
  if(jointThetas){
    Cg <- cov_gen(X1 = X0, theta = mle_par$k_theta_g * mle_par$theta, type = covtype)
  }else{
    Cg <- cov_gen(X1 = X0, theta = mle_par$theta_g, type = covtype)
  } 
  Kgi <- chol2inv(chol(Cg + diag(eps + mle_par$g/mult)))
  
  nmean <- drop(rowSums(Kgi) %*% mle_par$Delta / sum(Kgi)) ## ordinary kriging mean
  
  nu_hat_var <- max(eps, drop(crossprod(mle_par$Delta - nmean, Kgi) %*% (mle_par$Delta - nmean))/length(mle_par$Delta))
  
  # if(SiNK){
  #   rhox <- 1 / rho_AN(xx = X0, X0 = mle_par$pX, theta_g = mle_par$theta_g, g = mle_par$g, type = covtype, eps = eps, SiNK_eps = SiNK_eps, mult = mult)
  #   M <-  rhox * Cg %*% (Kgi %*% (mle_par$Delta - nmean))
  # }else{
  M <- Cg %*% (Kgi %*% (mle_par$Delta - nmean))
  # }
  # }else{
  #   Cg <- cov_gen(X1 = mle_par$pX, theta = mle_par$theta_g, type = covtype)
  #   Kgi <- chol2inv(chol(Cg + diag(eps + mle_par$g/mult)))
  #   
  #   kg <- cov_gen(X1 = X0, X2 = mle_par$pX, theta = mle_par$theta_g, type = covtype)
  #   
  #   nmean <- drop(rowSums(Kgi) %*% mle_par$Delta / sum(Kgi)) ## ordinary kriging mean
  #   
  #   nu_hat_var <- max(eps, drop(crossprod(mle_par$Delta - nmean, Kgi) %*% (mle_par$Delta - nmean))/length(mle_par$Delta))
  #   
  #   if(SiNK){
  #     rhox <- 1 / rho_AN(xx = X0, X0 = mle_par$pX, theta_g = mle_par$theta_g, g = mle_par$g, type = covtype, eps = eps, SiNK_eps = SiNK_eps, mult = mult)
  #     M <-  rhox * kg %*% (Kgi %*% (mle_par$Delta - nmean))
  #   }else{
  #     M <- kg %*% (Kgi %*% (mle_par$Delta - nmean))
  #   }
  # }
  
  
  Lambda <- drop(nmean + M)
  
  if(logN){
    Lambda <- exp(Lambda)
  }
  else{
    Lambda[Lambda <= 0] <- eps
  }
  # Lambda <- pmax(mult * eps, Lambda)
  
  LambdaN <- rep(Lambda, times = mult)
  
  Ki <- chol2inv(chol(add_diag(mle_par$sigma2 * cov_gen(X1 = X0, theta = mle_par$theta, type = covtype), Lambda/mult + eps))) 
  ldetKi <- determinant(Ki, logarithm = TRUE)[[1]][1]
  
  if(is.null(known$beta0))
    mle_par$beta0 <- drop(colSums(Ki) %*% Z0 / sum(Ki))
  
  psi_0 <- drop(crossprod(Z0 - mle_par$beta0, Ki) %*% (Z0 - mle_par$beta0))
  
  psi <- (crossprod((Z - mle_par$beta0)/LambdaN, Z - mle_par$beta0) - crossprod((Z0 - mle_par$beta0) * mult/Lambda, Z0 - mle_par$beta0) + psi_0)
  
  res <- list(theta = mle_par$theta, Delta = mle_par$Delta, psi = as.numeric(psi), beta0 = mle_par$beta0,
              k_theta_g = mle_par$k_theta_g, theta_g = mle_par$theta_g, g = mle_par$g, nmean = nmean, Lambda = Lambda,
              ll = out$value, ll_non_pen = ll_non_pen, nit_opt = out$counts, logN = logN, covtype = covtype, pX = mle_par$pX, msg = out$message,
              X0 = X0, Z0 = Z0, Z = Z, mult = mult, trendtype = trendtype, eps = eps,
              nu_hat_var = nu_hat_var, call = match.call(), nu = mle_par$nu, sigma2 = mle_par$sigma2,
              used_args = list(noiseControl = noiseControl, settings = settings, lower = lower, upper = upper, known = known),
              time = proc.time()[3] - tic)
  
  if(settings$return.matrices){
    res <- c(res, list(Ki = Ki, Kgi = Kgi))
  }
  
  if(settings$return.hom){
    res <- c(res, list(modHom = modHom, modNugs = modNugs))
  }
  
  class(res) <- "hetTP"
  
  return(res)
  
}

#'Student-t process predictions using a heterogeneous noise TP object (of class \code{hetTP}) 
#' @param x matrix of designs locations to predict at
#' @param object an object of class \code{hetTP}; e.g., as returned by \code{\link[hetGP]{mleHetTP}}
#' @param noise.var should the variance of the latent variance process be returned?
#' @param xprime optional second matrix of predictive locations to obtain the predictive covariance matrix between \code{x} and \code{xprime}
#' @param nugs.only if TRUE, only return noise variance prediction
#' @param ... no other argument for this method.
#' @return list with elements
#' \itemize{
#' \item \code{mean}: kriging mean;
#' \item \code{sd2}: kriging variance (filtered, e.g. without the nugget values)
#' \item \code{nugs}: noise variance
#' \item \code{sd2_var}: (optional) kriging variance of the noise process (i.e., on log-variances if \code{logN = TRUE})
#' \item \code{cov}: (optional) predictive covariance matrix between x and xprime
#' }
#' @details The full predictive variance corresponds to the sum of \code{sd2} and \code{nugs}.
#' @method predict hetTP 
#' @export
predict.hetTP <- function(object, x, noise.var = FALSE, xprime = NULL, nugs.only = FALSE, ...){
  
  if(is.null(dim(x))){
    x <- matrix(x, nrow = 1)
    if(ncol(x) != ncol(object$X0)) stop("x is not a matrix")
  }
  
  if(!is.null(xprime) && is.null(dim(xprime))){
    xprime <- matrix(xprime, nrow = 1)
    if(ncol(xprime) != ncol(object$X0)) stop("xprime is not a matrix")
  }
  
  n1 <- length(object$Z)
  if(is.null(object$Kgi)){
    if(is.null(object$pX)){
      Cg <- cov_gen(X1 = object$X0, theta = object$theta_g, type = object$covtype)
    }else{
      Cg <- cov_gen(X1 = object$pX, theta = object$theta_g, type = object$covtype)
    }
    object$Kgi <- chol2inv(chol(add_diag(Cg, object$eps + object$g/object$mult)))
  }
  
  # if(is.null(object$pX)){
  kg <- cov_gen(X1 = x, X2 = object$X0, theta = object$theta_g, type = object$covtype)
  # }else{
  #   kg <- cov_gen(X1 = x, X2 = object$pX, theta = object$theta_g, type = object$covtype)
  # }
  
  if(is.null(object$Ki))
    object$Ki <- chol2inv(chol(add_diag(object$sigma2 * cov_gen(X1 = object$X0, theta = object$theta, type = object$covtype), object$Lambda/object$mult + object$eps)))
  
  # if(object$SiNK){
  #   M <-  1/rho_AN(xx = x, X0 = object$pX, theta_g = object$theta_g, g = object$g,
  #                  type = object$covtype, SiNK_eps = object$SiNK_eps, eps = object$eps, mult = object$mult) * kg %*% (object$Kgi %*% (object$Delta - object$nmean))
  # }else{
  M <- kg %*% (object$Kgi %*% (object$Delta - object$nmean))
  # }
  
  if(object$logN){
    nugs <- exp(drop(object$nmean + M))
  }else{
    nugs <- pmax(0, drop(object$nmean + M))
  }
  
  if(nugs.only){
    return(list(nugs = nugs))
  }
  
  # if(noise.var){
  #   if(is.null(object$nu_hat_var))
  #     object$nu_hat_var <- max(object$eps, drop(crossprod(object$Delta - object$nmean, object$Kgi) %*% (object$Delta - object$nmean))/length(object$Delta)) ## To avoid 0 variance
  #   sd2var = object$nu_hat * object$nu_hat_var* drop(1 - fast_diag(kg, tcrossprod(object$Kgi, kg)) + (1 - tcrossprod(rowSums(object$Kgi), kg))^2/sum(object$Kgi))
  # }else{
  #   sd2var = NULL
  # }
  
  kx <- object$sigma2 * cov_gen(X1 = x, X2 = object$X0, theta = object$theta, type = object$covtype)
  
  # if(object$trendtype == 'SK'){
  sd2 <- (object$nu + object$psi - 2) / (object$nu + n1 - 2) * drop(object$sigma2 - fast_diag(kx, tcrossprod(object$Ki, kx)))
  # }else{
  #   sd2 <- drop(object$sigma2 *  - fast_diag(kx, tcrossprod(object$Ki, kx)) + (1 - tcrossprod(rowSums(object$Ki), kx))^2/sum(object$Ki))
  # }
  
  ## In case of numerical errors, some sd2 values may become negative
  if(any(sd2 < 0)){
    # object$Ki <- ginv(add_diag(object$sigma2 * cov_gen(X1 = object$X0, theta = object$theta, type = object$covtype), object$Lambda/object$mult + object$eps))
    # sd2 <- (object$nu + object$psi - 2) / (object$nu + n1 - 2) * drop(object$sigma2 - fast_diag(kx, tcrossprod(object$Ki, kx)))
    
    sd2 <- pmax(0, sd2)
    warning("Numerical errors caused some negative predictive variances to be thresholded to zero. Consider using ginv via rebuild.hetTP")
  }
  
  
  if(!is.null(xprime)){
    kxprime <- object$sigma2 * cov_gen(X1 = object$X0, X2 = xprime, theta = object$theta, type = object$covtype)
    # if(object$trendtype == 'SK'){
    if(nrow(x) > nrow(xprime)){
      cov <- (object$nu + object$psi - 2) / (object$nu + n1 - 2) * (object$sigma2 * cov_gen(X1 = x, X2 = xprime, theta = object$theta, type = object$covtype) - kx %*% object$Ki %*% kxprime)
    }else{
      cov <- (object$nu + object$psi - 2) / (object$nu + n1 - 2) * (object$sigma2 * cov_gen(X1 = x, X2 = xprime, theta = object$theta, type = object$covtype) - (kx %*% object$Ki) %*% kxprime)
    }
    
    # }else{
    #   cov <- (object$nu + object$psi - 2) / (object$nu + n1 - 2) * (object$sigma2 * cov_gen(X1 = x, X2 = xprime, theta = object$theta, type = object$covtype) - kx %*% tcrossprod(object$Ki, kxprime) + crossprod(1 - tcrossprod(rowSums(object$Ki), kx), 1 - tcrossprod(rowSums(object$Ki), kxprime))/sum(object$Ki))
    # }
  }else{
    cov = NULL
  }
  
  return(list(mean = drop(object$beta0 + kx %*% (object$Ki %*% (object$Z0 - object$beta0))),
              sd2 = sd2,
              nugs = nugs,
              # sd2var = sd2var,
              cov = cov))
  
}


#' @method summary hetTP
#' @export
summary.hetTP <- function(object,...){
  ans <- object
  class(ans) <- "summary.hetTP"
  ans
}

#' @export
print.summary.hetTP <- function(x, ...){
  cat("N = ", length(x$Z), " n = ", length(x$Z0), " d = ", ncol(x$X0), "\n")
  cat(x$covtype, " covariance lengthscale values of the main process: ", x$theta, "\n")
  cat("Variance/scale hyperparameter: ", x$sigma2, "\n")
  cat("Degree of freedom nu: ", x$nu, "\n")
  
  cat("Summary of Lambda values: \n")
  print(summary(x$Lambda))
  
  cat("Given constant trend value: ", x$beta0, "\n")
  
  
  if(x$logN){
    cat(x$covtype, " covariance lengthscale values of the log-noise process: ", x$theta_g, "\n")
    cat("Nugget of the log-noise process: ", x$g, "\n")
    cat("Estimated constant trend value of the log-noise process: ", x$nmean, "\n")
  }else{
    cat(x$covtype, " covariance lengthscale values of the log-noise process: ", x$theta_g, "\n")
    cat("Nugget of the noise process: ", x$g, "\n")
    cat("Estimated constant trend value of the noise process: ", x$nmean, "\n")
  }
  
  
  cat("MLE optimization: \n", "Log-likelihood = ", x$ll, "; Nb of evaluations (obj, gradient) by L-BFGS-B: ", x$nit_opt, "; message: ", x$msg, "\n")
  
}

#' @method print hetTP
#' @export
print.hetTP <- function(x, ...){
  print(summary(x))
}

#' @method plot hetTP
#' @export
plot.hetTP <- function(x, ...){
  LOOpreds <- LOO_preds(x)
  plot(x$Z, LOOpreds$mean[rep(1:nrow(x$X0), times = x$mult)], xlab = "Observed values", ylab = "Predicted values",
       main = "Leave-one-out predictions")
  arrows(x0 = LOOpreds$mean + sqrt(LOOpreds$sd2) * qt(0.05, df = x$nu + nrow(x$X0)),
         x1 = LOOpreds$mean + sqrt(LOOpreds$sd2) * qt(0.95, df = x$nu + nrow(x$X0)),
         y0 = LOOpreds$mean, length = 0, col = "blue")
  points(x$Z0[which(x$mult > 1)], LOOpreds$mean[which(x$mult > 1)], pch = 20, col = 2)
  abline(a = 0, b = 1, lty = 3)
  legend("topleft", pch = c(1, 20, NA), lty = c(NA, NA, 1), col = c(1, 2, 4),
         legend = c("observations", "averages (if > 1 observation)", "LOO prediction interval"))
}

## ' Rebuild inverse covariance matrices of \code{hetTP} (e.g., if exported without inverse matrices \code{Kgi} and/or \code{Ki})
## ' @param object \code{hetGP} model without slots \code{Ki} and/or \code{Kgi} (inverse covariance matrices)
## ' @param robust if \code{TRUE} \code{\link[MASS]{ginv}} is used for matrix inversion, otherwise it is done via Cholesky.
#' @rdname ExpImp
#' @method rebuild hetTP
#' @export
rebuild.hetTP <- function(object, robust = FALSE){
  if(robust){
    object$Kgi <- ginv(add_diag(cov_gen(X1 = object$X0, theta = object$theta_g, type = object$covtype), object$eps + object$g/object$mult))
    
    object$Ki <- ginv(add_diag(object$sigma2 * cov_gen(X1 = object$X0, theta = object$theta, type = object$covtype), object$Lambda/object$mult + object$eps))
  }else{
    object$Kgi <- chol2inv(chol(add_diag(cov_gen(X1 = object$X0, theta = object$theta_g, type = object$covtype), object$eps + object$g/object$mult)))
    
    object$Ki <- chol2inv(chol(add_diag(object$sigma2 * cov_gen(X1 = object$X0, theta = object$theta, type = object$covtype), object$Lambda/object$mult + object$eps)))
  }
  return(object)
}
Any scripts or data that you put into this service are public.
hetGP documentation built on Sept. 11, 2024, 6:56 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
hetGP
Heteroskedastic Gaussian Process Modeling and Design under Replication

R/hetTP.R
In hetGP: Heteroskedastic Gaussian Process Modeling and Design under Replication

Defines functions rebuild.hetTP plot.hetTP print.hetTP print.summary.hetTP summary.hetTP predict.hetTP mleHetTP dlogLikHetTP logLik_HetTP rebuild.homTP predict.homTP plot.homTP print.homTP print.summary.homTP summary.homTP mleHomTP dlogLikHomTP loglik_HomTP

Documented in mleHetTP mleHomTP predict.hetTP predict.homTP rebuild.hetTP rebuild.homTP

Try the hetGP package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

hetGP Heteroskedastic Gaussian Process Modeling and Design under Replication

R/hetTP.R In hetGP: Heteroskedastic Gaussian Process Modeling and Design under Replication

Defines functions rebuild.hetTP plot.hetTP print.hetTP print.summary.hetTP summary.hetTP predict.hetTP mleHetTP dlogLikHetTP logLik_HetTP rebuild.homTP predict.homTP plot.homTP print.homTP print.summary.homTP summary.homTP mleHomTP dlogLikHomTP loglik_HomTP

Documented in mleHetTP mleHomTP predict.hetTP predict.homTP rebuild.hetTP rebuild.homTP

Try the hetGP package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

hetGP
Heteroskedastic Gaussian Process Modeling and Design under Replication

R/hetTP.R
In hetGP: Heteroskedastic Gaussian Process Modeling and Design under Replication