Nothing
#' @title cvomega
#'
#' @description This functions selects the omega tuning parameter for ridge penalization of the empirical Gaussian copula correlation matrix via cross-validation.
#' The objective function is the Gaussian log-likelihood, and a grid search is performed using K folds.
#'
#' @param sample A sample from a \eqn{q}-dimensional random vector \eqn{\mathbf{X}} (\eqn{n \times q} matrix with observations in rows, variables in columns).
#' @param omegas A grid of candidate penalty parameters in \eqn{[0,1]}.
#' @param K The number of folds to be used.
#'
#' @details The loss function is the Gaussian log-likelihood, i.e., given an estimated (penalized)
#' Gaussian copula correlation matrix (normal scores rank correlation matrix) \eqn{\widehat{\mathbf{R}}_{n}^{(-j)}} computed on a training set leaving out fold j, and
#' \eqn{\widehat{\mathbf{R}}_{n}^{(j)}} the empirical (non-penalized)
#' Gaussian copula correlation matrix computed on test fold j, we search for the tuning parameter that minimizes
#' \deqn{\sum_{j = 1}^{K} \left [\ln \left ( \left | \widehat{\mathbf{R}}_{n}^{(-j)} \right | \right ) + \text{tr} \left \{\widehat{\mathbf{R}}_{n}^{(j)} \left (\widehat{\mathbf{R}}_{n}^{(-j)} \right )^{-1} \right \} \right ].}
#' The underlying assumption is that the copula of \eqn{\mathbf{X}} is Gaussian.
#'
#' @return The optimal ridge penalty parameter minimizing the cross-validation error.
#'
#' @references
#' De Keyser, S. & Gijbels, I. (2024).
#' High-dimensional copula-based Wasserstein dependence.
#' doi: https://doi.org/10.48550/arXiv.2404.07141.
#'
#' Warton, D.I. (2008).
#' Penalized normal likelihood and ridge regularization of correlation and covariance matrices.
#' Journal of the American Statistical Association 103(481):340-349. \cr
#' doi: https://doi.org/10.1198/016214508000000021.
#'
#' @seealso \code{\link{estR}} for computing the (Ridge penalized) empirical Gaussian copula correlation matrix.
#'
#'
#' @examples
#' q = 10
#' n = 50
#'
#' # AR(1) correlation matrix with correlation 0.5
#' R = 0.5^(abs(matrix(1:q-1,nrow = q, ncol = q, byrow = TRUE) - (1:q-1)))
#'
#' # Sample from multivariate normal distribution
#' sample = mvtnorm::rmvnorm(n,rep(0,q),R,method = "chol")
#'
#' # 5-fold cross-validation with Gaussian likelihood as loss for selecting omega
#' omega = cvomega(sample = sample,omegas = seq(0.01,0.999,len = 50),K = 5)
#'
#' R_est = estR(sample,omega = omega)
#' @export
cvomega = function(sample, omegas, K){
n = nrow(sample) # Sample size
q = ncol(sample) # Total dimension
scores = matrix(0,n,q) # Matrix for normal scores
for(j in 1:q){
scores[,j] = stats::qnorm((n/(n+1)) * stats::ecdf(sample[,j])(sample[,j])) # Normal scores
}
LLs = integer(length(omegas))
for(l in 1:length(omegas)){
LLs[l] = CVLF(omegas[l],scores,K) # Objectives for each candidate omega
}
return(omegas[which.max(LLs)]) # Omega that maximizes the objective
}
# Auxiliary functions
LogL = function(data,R,omega){
# Log-likelihood of Gaussian model with correlation matrix R and penalty parameter omega.
# data is the validation data, R is the normal scores rank correlation matrix based on training data, omega is the penalty parameter.
q = nrow(R)
n = nrow(data)
sigmaD = diag(rep(sqrt(sum(stats::qnorm(seq(1,n)/(n+1))^2)/(n-1)),q)) # Diagonal matrix with normal scores standard deviations (independent of the data)
sigmaL = (1/omega) * sigmaD %*% (omega * R + (1-omega) * diag(q)) %*% sigmaD # Penalized estimate based on training data
t1 = n * q * log(2*pi) + n * log(det(sigmaL)) # First term for log-likelihood
t2 = sum(diag(data %*% solve(sigmaL) %*% t(data))) # Second term for log-likelihood
return((-1/2)*(t1 + t2))
}
CVLF = function(omega,data,K){
# Cross-validated log-likelihood function with penalty parameter omega using K folds.
# data should be normal scores.
n = nrow(data)
ind = sample(seq(1,n)) # Indices for folds
fld_size = floor(n/K) # Size of each fold
flds = list() # Folds
for(i in 1:K){
flds[[i]] = ind[((fld_size)*(i-1) + 1):(fld_size*i)] # Assign indices to each fold
}
if(n-(K*fld_size) > 0){ # If n is not a multiple of K
for(i in 1:(n-(K*fld_size))){
flds[[i]] = c(flds[[i]],ind[fld_size * K + i]) # Assign remaining indices to first folds
}
}
LL = 0 # Sum of log-likelihood objective over all folds
for(i in 1:K){
valid = data[flds[[i]],] # Validation data
train = data[setdiff(seq(1,n),flds[[i]]),] # Training data
R_est = stats::cor(train) # Normal scores rank correlation matrix computed using training data
LL = LL + LogL(valid,R_est,omega)
}
return(LL)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.