#' Simulate Multivariate Crop Yield Data Using a Generalized Copula-GEV-BSTS Model
#'
#' This function simulates multivariate crop yield data using a time-varying copula
#' combined with Generalized Extreme Value (GEV) margins and Bayesian Structural Time Series (BSTS) models.
#'
#' @param nsim Integer, number of simulation replications.
#' @param n_train Integer, number of training observations.
#' @param n_test Integer, number of test observations.
#' @param copula Character, specifying the copula type: "Clayton", "Frank",
#' "Gumbel", "Joe", or "Gaussian".
#' @param init_params Numeric vector, initial parameter values for optimization.
#' @param fn Function, the objective minimized by \code{stats::optim};
#' typically the negative generalized log-likelihood of the copula-GEV model.
#' @param U_train Numeric matrix (n_train x D), pseudo-observations for the copula.
#' @param Z_train Numeric array (n_train x D x M), observed data for each margin and sub-feature.
#' @param X Numeric matrix with at least \code{n_train} rows and M columns,
#' risk factors for the dynamic copula parameter (only the first
#' \code{n_train} rows are used).
#' @param Y_test Numeric matrix (n_test x D), true future values for MSE calculation.
#' @param BSTS_list List of length D, each element is a BSTS model for a different margin.
#' @return A list containing:
#' \item{optim_results}{Results from the optimization process.}
#' \item{theta_sim}{Simulated copula parameters across replications: an
#'   (nsim x n_test) matrix, or an (nsim x n_test x D(D-1)/2) array for the
#'   Gaussian copula with more than two margins.}
#' \item{Y_sim}{Simulated final BSTS-based forecasts.}
#' \item{MSE}{Mean squared error for each simulation run.}
#' @importFrom stats cor optim rnorm dnorm qnorm predict
#' @importFrom evd rgev
#' @importFrom copula claytonCopula frankCopula gumbelCopula joeCopula normalCopula rCopula
#' @importFrom utils combn
#' @import bsts
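#' @examples
#' \dontrun{
#' ## Minimal usage sketch (not run). `neg_loglik`, `U_tr`, `Z_tr`, `X_tr`,
#' ## `Y_te`, and `bsts_fits` are hypothetical user-supplied objects; with
#' ## D = 2 margins and M = 2 risk factors, init_params needs
#' ## 2 + M + 4 * D * M = 20 entries.
#' res <- simulation_generalized(
#'   nsim = 10, n_train = 80, n_test = 20, copula = "Clayton",
#'   init_params = rep(0.1, 20), fn = neg_loglik,
#'   U_train = U_tr, Z_train = Z_tr, X = X_tr,
#'   Y_test = Y_te, BSTS_list = bsts_fits
#' )
#' summary(res$MSE)
#' }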
#' @export
#'
#'
simulation_generalized <- function(
nsim = 100,
n_train,
n_test,
copula,
init_params,
fn, # objective minimized by optim (typically the negative generalized log-likelihood)
U_train, # dimension: (n_train x D)
Z_train, # dimension: (n_train x D x M)
X, # dimension: (n_train x M)
Y_test, # dimension: (n_test x D) matrix of true future values for MSE
BSTS_list # a list of length D, each a BSTS model for margin d
){
#####################################################################
## PART 1: Basic checks & define dimension D, M
#####################################################################
supported_copulas <- c("Clayton","Frank","Gumbel","Joe","Gaussian")
if (!copula %in% supported_copulas) {
stop("Invalid copula provided. Must be one of: ", paste(supported_copulas, collapse=", "))
}
D <- ncol(U_train) # number of margins in the copula
M <- dim(Z_train)[3] # number of covariates in each margin
#####################################################################
## PART 2: Optimize log-likelihood 'fn' to get best-fit parameters
#####################################################################
# Optional box constraints for method = "L-BFGS-B", kept for reference:
# n_gev_blocks <- D * M
#
# lower_vec <- c(
# rep(-Inf, 2), # omega, alpha (any real)
# rep(-Inf, M), # gamma_1..gamma_M (any real)
# # Now repeat the pattern (phi_lower, sigma_mu_lower, sigma_gev_lower, xi_lower)
# # exactly D*M times, in the same order we parse them
# rep(c(0, 1e-6, 1e-6, -0.5), times = n_gev_blocks)
# )
#
# upper_vec <- c(
# rep( Inf, 2),
# rep( Inf, M),
# # Repeat the pattern (phi_upper, sigma_mu_upper, sigma_gev_upper, xi_upper)
# rep(c(1, Inf, Inf, 0.5), times = n_gev_blocks)
# )
optim_results <- stats::optim(
par = init_params,
fn = fn,
# method = "L-BFGS-B",
# lower = lower_vec,
# upper = upper_vec,
U = U_train,
Z = Z_train,
X = X[1:n_train, , drop=FALSE], # training portion if needed
copula = copula)
par_hat <- optim_results$par
#####################################################################
## PART 3: Parse Copula & GEV Parameters
#####################################################################
# First chunk of par_hat: (omega, alpha, gamma[1..M])
omega <- par_hat[1]
alpha <- par_hat[2]
# gamma has length M: one loading per risk factor in the copula dynamics.
gamma <- par_hat[3:(2 + M)]
offset <- 2 + M
# Next chunk: each (d,m) has 4 GEV parameters: phi_gev, sigma_mu, sigma_gev, xi_gev
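# Layout sketch (assumed ordering, with m varying fastest within each d):
# for D = 2, M = 2, par_hat is
#   ( omega, alpha, gamma_1, gamma_2,
#     phi[1,1], sigma_mu[1,1], sigma_gev[1,1], xi[1,1],
#     phi[1,2], sigma_mu[1,2], sigma_gev[1,2], xi[1,2],
#     phi[2,1], ..., xi[2,2] )
# i.e. length(par_hat) = 2 + M + 4 * D * M = 20.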
phi_gev <- matrix(NA, nrow=D, ncol=M)
sigma_mu <- matrix(NA, nrow=D, ncol=M)
sigma_gev <- matrix(NA, nrow=D, ncol=M)
xi_gev <- matrix(NA, nrow=D, ncol=M)
for (d_idx in seq_len(D)) {
for (m_idx in seq_len(M)) {
phi_gev[d_idx, m_idx] <- par_hat[offset + 1]
sigma_mu[d_idx, m_idx] <- par_hat[offset + 2]
sigma_gev[d_idx, m_idx] <- par_hat[offset + 3]
xi_gev[d_idx, m_idx] <- par_hat[offset + 4]
offset <- offset + 4
}
}
#####################################################################
## PART 4: Allocate arrays for simulation
#####################################################################
# We'll store the AR(1) location in a 4D array: (nsim x n_test x D x M).
mu_gev <- array(NA, dim=c(nsim, n_test, D, M))
# We'll store the GEV draws themselves similarly:
Z_sim <- array(NA, dim=c(nsim, n_test, D, M))
# We'll store final BSTS-based forecasts in a 3D array: (nsim x n_test x D).
Y_sim <- array(NA, dim=c(nsim, n_test, D))
# We'll store MSE in a vector of length nsim.
MSE <- numeric(nsim)
# Copula parameter storage:
if (copula == "Gaussian"&& D>2) {
nPairs <- D*(D-1)/2
# We'll store a param for each replicate i, each time t, each pair p:
theta_mat <- array(NA, dim=c(nsim, n_test, nPairs))
} else {
# For archimedean: one param per replicate i, per time t
theta_vec <- matrix(NA, nrow=nsim, ncol=n_test)
}
#####################################################################
## PART 5: Initialize AR(1) states (time=1) & Copula param (time=1)
#####################################################################
# 5A) AR(1) location for t=1
for (d_idx in seq_len(D)) {
  for (m_idx in seq_len(M)) {
    # Use the training-sample mean of Z as the initial AR(1) location.
    init_val <- mean(Z_train[, d_idx, m_idx])
    mu_gev[, 1, d_idx, m_idx] <- init_val
    # Draw the first GEV observation for all nsim replications at once.
    Z_sim[, 1, d_idx, m_idx] <- evd::rgev(nsim, loc = init_val,
                                          scale = sigma_gev[d_idx, m_idx],
                                          shape = xi_gev[d_idx, m_idx])
    # (Disabled) optional truncation at the GEV lower support bound when xi > 0:
    # min_val <- init_val - sigma_gev[d_idx, m_idx] / xi_gev[d_idx, m_idx]
    # Z_sim[, 1, d_idx, m_idx] <- pmax(Z_sim[, 1, d_idx, m_idx], min_val + 0.001)
  }
}
# 5B) Copula parameter for t = 1: invert the average pairwise Kendall's tau
# (off-diagonal entries only) via the family-specific tau-inversion helpers.
tau_mat <- cor(U_train, method = "kendall")
tau_bar <- mean(tau_mat[lower.tri(tau_mat)])
if (copula == "Clayton") {
  theta_vec[, 1] <- clayton.theta(tau_bar)
} else if (copula == "Frank") {
  theta_vec[, 1] <- frank.theta(tau_bar)
} else if (copula == "Gumbel") {
  theta_vec[, 1] <- GH.theta(tau_bar)
} else if (copula == "Joe") {
  theta_vec[, 1] <- joe.theta(tau_bar)
} else if (copula == "Gaussian" && D > 2) {
  pair_idx <- combn(D, 2)
  for (p_idx in seq_len(ncol(pair_idx))) {
    i_d <- pair_idx[1, p_idx]
    j_d <- pair_idx[2, p_idx]
    theta_mat[, 1, p_idx] <- cor(U_train[, i_d], U_train[, j_d], method = "pearson")
  }
} else if (copula == "Gaussian" && D == 2) {
  theta_vec[, 1] <- cor(U_train[, 1], U_train[, 2], method = "pearson")
}
#####################################################################
## PART 6: Main loop: (i in 1..nsim), (t in 2..n_test), (d,m in GEV)
#####################################################################
for (i_sim in seq_len(nsim)) {
for (t_idx in 2:n_test) {
# (a) AR(1)+GEV update
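# Sketch of the state/observation pair driving each (d, m) series:
#   mu_t = phi * mu_{t-1} + eps_t,   eps_t ~ N(0, sigma_mu^2)
#   Z_t ~ GEV(loc = mu_t, scale = sigma_gev, shape = xi)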
for (d_idx in seq_len(D)) {
for (m_idx in seq_len(M)) {
eps_val <- stats::rnorm(1, mean = 0, sd = max(sigma_mu[d_idx, m_idx],1e-6))
mu_gev[i_sim, t_idx, d_idx, m_idx] <-
phi_gev[d_idx,m_idx] * mu_gev[i_sim, t_idx-1, d_idx, m_idx] + eps_val
# sample from GEV
Z_sim[i_sim, t_idx, d_idx, m_idx] <- evd::rgev(1, loc = mu_gev[i_sim, t_idx, d_idx, m_idx],
scale = max(sigma_gev[d_idx,m_idx],1e-6),
shape = xi_gev[d_idx,m_idx])
# (Disabled) optional truncation at the GEV lower support bound when xi > 0:
# min_val <- mu_gev[i_sim, t_idx, d_idx, m_idx] - sigma_gev[d_idx, m_idx] / xi_gev[d_idx, m_idx]
# if (Z_sim[i_sim, t_idx, d_idx, m_idx] < min_val) Z_sim[i_sim, t_idx, d_idx, m_idx] <- min_val + 0.001
}
}
# (b) Copula param update
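# One-step evolution of the dependence parameter (the dynamics implemented
# in the branches below):
#   theta_t = g( omega + alpha * theta_{t-1} + sum_m gamma_m * max_d Z_{t-1,d,m} )
# where g keeps theta admissible: abs(.) for Clayton/Frank, abs(.) + 1 for
# Gumbel/Joe, and tanh(.) for Gaussian correlations.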
if (copula == "Clayton") {
old_th <- theta_vec[i_sim, t_idx-1]
raw_val <- omega + alpha * old_th + sum( apply(Z_sim[i_sim, t_idx-1, , ], 2, max, na.rm=TRUE) * gamma )
theta_vec[i_sim, t_idx] <- abs(raw_val)
} else if (copula == "Frank") {
old_th <- theta_vec[i_sim, t_idx-1]
raw_val <- omega + alpha * old_th + sum( apply(Z_sim[i_sim, t_idx-1, , ], 2, max, na.rm=TRUE) * gamma )
theta_vec[i_sim, t_idx] <- abs(raw_val)
} else if (copula == "Gumbel") {
old_th <- theta_vec[i_sim, t_idx-1]
raw_val <- abs(omega + alpha * old_th + sum( apply(Z_sim[i_sim, t_idx-1, , ], 2, max, na.rm=TRUE) * gamma )) + 1
theta_vec[i_sim, t_idx] <- raw_val
} else if (copula == "Joe") {
old_th <- theta_vec[i_sim, t_idx-1]
raw_val <- abs(omega + alpha * old_th + sum( apply(Z_sim[i_sim, t_idx-1, , ], 2, max, na.rm=TRUE) * gamma )) + 1
theta_vec[i_sim, t_idx] <- raw_val
} else if (copula == "Gaussian"&& D>2) {
old_theta <- theta_mat[i_sim, t_idx-1, ] # Previous theta values for all correlation pairs
pair_idx <- combn(D, 2) # Generate all unique region pairs (D choose 2)
num_pairs <- ncol(pair_idx) # Number of correlation pairs
# Loop over each correlation pair
for (pair in seq_len(num_pairs)) {
r1 <- pair_idx[1, pair] # First region in the pair
r2 <- pair_idx[2, pair] # Second region in the pair
# Compute the maximum for each risk factor across the two regions
# This gives a vector of length M (one per risk factor)
max_z_vals <- pmax(Z_sim[i_sim, t_idx-1, r1, ],
Z_sim[i_sim, t_idx-1, r2, ])
# Calculate raw value by summing gamma_k * (max over regions for risk factor k)
raw_val <- omega + alpha * old_theta[pair] + sum( max_z_vals * gamma )
# Update the correlation coefficient via tanh transformation
theta_mat[i_sim, t_idx, pair] <- tanh(raw_val)
}
} else if (copula == "Gaussian" && D==2){
old_th <- theta_vec[i_sim, t_idx-1]
raw_val <- tanh(omega + alpha * old_th + sum( apply(Z_sim[i_sim, t_idx-1, , ], 2, max, na.rm=TRUE) * gamma ))
theta_vec[i_sim, t_idx] <- raw_val
}
} # end for t_idx
###################################################################
## PART 7: Now sample from copula -> BSTS -> final forecasts
###################################################################
niter_bsts <- BSTS_list[[1]]$niter
u_sim <- matrix(NA, nrow=niter_bsts, ncol=D*n_test)
# build an index for columns
col_index_base <- seq(1, D*n_test - (D-1), by=D)
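# Column layout sketch: margins are interleaved within each time block.
# For D = 2, n_test = 3 the columns of u_sim correspond to
#   (t=1,d=1), (t=1,d=2), (t=2,d=1), (t=2,d=2), (t=3,d=1), (t=3,d=2)
# so col_index_base = c(1, 3, 5) marks the first column of each time block.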
# For each t in 1..n_test, sample from copula param => store columns
for (t_idx in seq_len(n_test)) {
# build the copula object for replicate i_sim at time t_idx
if (copula == "Clayton") {
param_now <- theta_vec[i_sim, t_idx]
cop_obj <- copula::claytonCopula(param=param_now, dim=D)
} else if (copula == "Frank") {
param_now <- theta_vec[i_sim, t_idx]
cop_obj <- copula::frankCopula(param=param_now, dim=D)
} else if (copula == "Gumbel") {
param_now <- theta_vec[i_sim, t_idx]
cop_obj <- copula::gumbelCopula(param=param_now, dim=D)
} else if (copula == "Joe") {
param_now <- theta_vec[i_sim, t_idx]
cop_obj <- copula::joeCopula(param=param_now, dim=D)
} else if (copula == "Gaussian" && D>2) {
# build correlation matrix from theta_mat[i_sim, t_idx, ]
R <- diag(D)
pair_idx <- combn(D,2)
for (p_idx in seq_len(nPairs)) {
i_d <- pair_idx[1,p_idx]
j_d <- pair_idx[2,p_idx]
R[i_d, j_d] <- theta_mat[i_sim, t_idx, p_idx]
R[j_d, i_d] <- theta_mat[i_sim, t_idx, p_idx]
}
cop_obj <- copula::normalCopula(param = R[upper.tri(R)], dim=D, dispstr="un")
} else if (copula == "Gaussian" && D==2){
param_now <- theta_vec[i_sim, t_idx]
cop_obj <- copula::normalCopula(param=param_now, dim=D, dispstr = "un")
}
U_block <- copula::rCopula(niter_bsts, cop_obj) # (niter_bsts x D)
# store columns
col_start <- col_index_base[t_idx]
for (d_idx in seq_len(D)) {
u_sim[, col_start + (d_idx-1)] <- U_block[, d_idx]
}
}
# Now we convert each margin's columns to normal eps, call BSTS, combine
# We'll fill Y_sim[i_sim, , d_idx].
for (d_idx in seq_len(D)) {
# Identify the corresponding column index in u_sim
col_here <- col_index_base + (d_idx-1)
# Generate the noise term from the copula-based residuals
eps_draw <- stats::qnorm(u_sim[, col_here], mean=0, sd=BSTS_list[[d_idx]]$sigma.obs)
# Extract the simulated covariates for margin d over all n_test periods;
# when newdata is supplied, its rows define the forecast horizon.
z_new <- Z_sim[i_sim, , d_idx, ]
predDist_d <- predict(BSTS_list[[d_idx]], horizon = 1, newdata = z_new,
                      burn = floor(0.1 * niter_bsts))$distribution
# Subtract an observation-noise draw from each forecast column, using a
# randomly sampled posterior draw of sigma.obs rather than only its first
# element.
predictions_d <- apply(predDist_d, 2, function(col, sds) {
  col - stats::rnorm(1, mean = 0, sd = sample(sds, 1))
}, sds = BSTS_list[[d_idx]]$sigma.obs)
# Select one posterior draw (row) at random for the forecast path and one
# for the copula-based noise.
pick_pred <- sample(nrow(predictions_d), 1)
pick_eps  <- sample(nrow(eps_draw), 1)
Y_sim[i_sim, , d_idx] <- predictions_d[pick_pred, ] + eps_draw[pick_eps, ]
}
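# Accumulated squared error across margins, averaged over the test horizon:
#   MSE_i = (1 / n_test) * sum_t sum_d ( Y_sim[i, t, d] - Y_test[t, d] )^2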
sqerr <- 0
for (d_idx in seq_len(D)) {
sqerr <- sqerr + (Y_sim[i_sim, , d_idx] - Y_test[,d_idx])^2
}
MSE[i_sim] <- mean(sqerr)
} # end for i_sim
#####################################################################
## PART 8: Return final results
#####################################################################
res_list <- list(
optim_results = optim_results,
# if Gaussian => return theta_mat, else => theta_vec
theta_sim = if (copula=="Gaussian" && D>2) theta_mat else theta_vec,
# Z_sim = Z_sim,
Y_sim = Y_sim,
MSE = MSE
)
return(res_list)
}
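
## ---------------------------------------------------------------------------
## The tau-inversion helpers called in PART 5B (clayton.theta, frank.theta,
## GH.theta, joe.theta) are assumed to be defined elsewhere in the package.
## A minimal sketch of what they are presumed to compute, using the standard
## Kendall's-tau relationships; Frank and Joe have no closed form, so the
## sketch defers to copula::iTau for the numerical inversion:
clayton.theta_sketch <- function(tau) 2 * tau / (1 - tau)   # tau = theta / (theta + 2)
GH.theta_sketch      <- function(tau) 1 / (1 - tau)         # tau = 1 - 1 / theta
frank.theta_sketch   <- function(tau) copula::iTau(copula::frankCopula(), tau)
joe.theta_sketch     <- function(tau) copula::iTau(copula::joeCopula(), tau)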