#' Run Hierarchical Bayesian GxE model
#'
#' @description
#' \code{model_bh_GxE} runs Hierarchical Bayesian GxE modelto get main germplasm, environment and sensitivity effects over the network
#'
#' @param data The data frame on which the model is run. It should come from \code{\link{format_data_PPBstats.data_agro}}
#'
#' @param variable The variable on which runs the model
#'
#' @param nb_iterations Number of iterations of the MCMC
#'
#' @param thin thinning interval to reduce autocorrelations between samples of the MCMC
#'
#' @param return.alpha Return the value for each germplasm main effect (alpha_i)
#' @param return.sigma_alpha Return the value of the variance of the distribution where the alpha_i come from
#'
#' @param return.beta Return the value for each sensitivity to environments (beta_i)
#' @param return.sigma_beta Return the value of the variance of the distribution where the beta_i come from
#'
#' @param return.theta Return the value for each environment main effect (theta_j)
#' @param return.sigma_theta Return the value of the variance of the distribution where the theta_j come from
#'
#' @param return.epsilon Return the value of the residuals of the model (epsilon_ij)
#' @param return.sigma_epsilon Return the value of the variance of the distribution where the epsilon_ij come from
#'
#' @param return.DIC Return the DIC value of the model. See details for more informations.
#'
#' @details
#'
#' Hierarchical Bayesian GxE model estimates germplasm (alpha_i), environment (theta_j) and sensitivity to interaction (beta_i) effects.
#' An environment is a combinaison of a location and a year.
#'
#' The different effects are taken in different distributions of respective variances sigma_alpha, sigma_theta and sigma_beta.
#' This model takes into acount all the information on the network in order to cope with the high disequilibrium in the dataset (i.e. high percentage of missing GxE combinaisons on the network).
#'
#' First, the additive model is done. This model gives intitial values of some parameters of the Hierarchical Finlay Wilkinson model which is done next.
#'
#' The model is run on data set where germplasms are on at least two environments.
#'
#' More information can be found in the book: https://priviere.github.io/PPBstats_book/family-2.html#model-2
#'
#' For DIC value, see ?\code{dic.samples} from the \code{rjags} package for more information.
#'
#' @return The function returns a list with
#'
#' \itemize{
#' \item "data.presence.absence.matrix": a matrix germplasm x environment with the number of occurence in the data
#' \item "model.presence.absence.matrix": a matrix germplasm x environment with the number of occurence in the data used for the model (i.e. with at least two germplasm by environments.)
#' \item "germplasm.not.used": the vector of germplasms not used in the analysis because they were not on at least two environments. If NULL, all the germplasms were used in the analysis.
#' \item "MCMC": a list with the two MCMC chains (mcmc object) from the model
#' \item "epsilon": a vector with the median value of the epsilon_ij
#' \item "DIC": the DIC value of the model
#' }
#'
#' @author Pierre Riviere for R code and Olivier David for JAGS code
#'
#' @references
#' P. Riviere, J.C. Dawson, I. Goldringer, and O. David. Hierarchical multiplicative modeling of genotype x environment interaction for flexible experiments in decentralized participatory plant breeding. In prep, 2015.
#'
#' @seealso
#' \itemize{
#' \item \code{\link{check_model}}
#' \item \code{\link{check_model.fit_model_bh_GxE}}
#' \item \code{\link{cross_validation_model_bh_GxE}}
#' \item \code{\link{predict_the_past_model_bh_GxE}}
#' }
#'
#' @import rjags
#' @import stats
#' @importFrom methods is
#'
#' @export
#'
model_bh_GxE = function(
data,
variable,
nb_iterations = 100000,
thin = 10,
return.alpha = TRUE,
return.sigma_alpha = TRUE,
return.beta = TRUE,
return.sigma_beta = TRUE,
return.theta = TRUE,
return.sigma_theta = TRUE,
return.epsilon = FALSE,
return.sigma_epsilon = TRUE,
return.DIC = FALSE
)
{
# 1. Error message and update arguments ----------
if(!is(data, "data_agro")){ stop(substitute(data), " must be formated with type = \"data_agro\", see PPBstats::format_data_PPBstats().") }
check_data_vec_variables(data, variable)
if(nb_iterations < 20000) { warning("nb_iterations is below 20 000, which seems small to get convergence in the MCMC.") }
# Get the parameters to estimate
parameters = NULL # be careful, the parameters should be in the alphabetic order
if(return.epsilon) { return.sigma_epsilon = TRUE } # Useful to standardized residuals in analyse.outputs
if(return.alpha) {parameters = c(parameters, "alpha")}
if(return.beta) {parameters = c(parameters, "beta")}
if(return.sigma_alpha) {parameters = c(parameters, "sigma_alpha")}
if(return.sigma_beta) {parameters = c(parameters, "sigma_beta")}
if(return.sigma_epsilon) {parameters = c(parameters, "sigma_epsilon")}
if(return.sigma_theta) {parameters = c(parameters, "sigma_theta")}
if(return.theta) {parameters = c(parameters, "theta")}
if(is.null(parameters) & return.epsilon == FALSE) { stop("You should choose at least one parameter to return: y, alpha, sigma_alpha, beta, sigma_beta, theta, sigma_theta, espilon or sigma_epsilon.") }
# 2. Data formating ----------
# 2.1. Get the environments ----------
environment = paste(as.character(data$location), as.character(data$year), sep = ":")
D = cbind.data.frame(
germplasm = as.factor(as.character(data$germplasm)),
environment = as.factor(environment),
variable = as.numeric(as.character(data[,variable]))
)
formule = as.formula("variable ~ germplasm + environment")
D = droplevels(aggregate(formule, FUN = mean, data = D))
# 2.2. Get only germplasm that are on at least two environments ----------
data.presence.absence.matrix = with(D, table(germplasm, environment))
t = apply(data.presence.absence.matrix, 1, sum)
germ.to.get = names(t[which(t>=2)])
germplasm.to.get = is.element(D$germplasm, germ.to.get)
germplasm.not.used = D$germplasm[!is.element(D$germplasm, germ.to.get)]
if( length(germplasm.not.used) == 0 ) { germplasm.not.used = NULL }
D = droplevels(D[germplasm.to.get,])
model.presence.absence.matrix = with(D, table(germplasm, environment))
# 3. Get the informations for the model ----------
D = D[order(D$germplasm, D$environment),]
germplasm = as.character(D$germplasm)
nb_germplasm = length(unique(germplasm))
environment = as.character(D$environment)
nb_environment = length(unique(environment))
y = as.numeric(as.character(D$variable))
nu = mean(y, na.rm = TRUE)
# Transform names with numbers to be ok with jags
l = unique(environment)
environment.names.data = l; names(environment.names.data) = seq(1, length(l), 1)
environment.names.jags = seq(1, length(l), 1); names(environment.names.jags) = l
environment = as.factor(environment.names.jags[environment])
g = unique(germplasm)
germplasm.names.data = g; names(germplasm.names.data) = seq(1, length(g), 1)
germplasm.names.jags = seq(1, length(g), 1); names(germplasm.names.jags) = g
germplasm = as.factor(germplasm.names.jags[germplasm])
d <- list(y = y, germplasm = germplasm, nb_germplasm = nb_germplasm, environment = environment, nb_environment = nb_environment, nu = nu)
# 4. Write and run the additive model to get initial value for mu (mean of alpha_i), sigma_alpha and sigma_theta of the FWH model ----------
message("Run additive Bayesian model ...")
model_add_jags ="
model
{
# vraisemblance
for (i in 1:length(y))
{
y[i] ~ dnorm(y_hat[i],tau_epsilon)
y_hat[i] <- alpha[germplasm[i]] + theta[environment[i]]
}
# prior
for (i in 1:nb_germplasm) { alpha[i] ~ dnorm(mu,tau_alpha) }
for (i in 1:nb_environment) { theta[i] ~ dnorm(0,tau_theta) }
tau_epsilon ~ dgamma(1.0E-6,1.0E-6)
sigma_epsilon <- 1/sqrt(tau_epsilon)
tau_alpha <- 1/pow(sigma_alpha,2)
sigma_alpha ~ dunif(0,nu)
tau_theta <- 1/pow(sigma_theta,2)
sigma_theta ~ dunif(0,nu)
mu ~ dnorm(nu,1/pow(nu,2))
}"
# Initial values
init1 <- list(".RNG.name"="base::Mersenne-Twister",".RNG.seed"=1234)
init2 <- list(".RNG.name"="base::Wichmann-Hill",".RNG.seed"=5678)
init <- list(init1,init2)
# Model
model <- jags.model(file = textConnection(model_add_jags), data = d, inits = init, n.chains = 2)
update(model,1000) # Burn-in
# Run the model
mcmc_add <- coda.samples(model, c("mu", "sigma_alpha", "sigma_theta"), n.iter = nb_iterations, thin = thin)
# Rename the parameters
# one again, the name of the parameters must be in the alphabetic order
n = colnames(mcmc_add[[1]])
para.name = NULL
para.name = c(para.name, "mu")
para.name = c(para.name, "sigma_alpha")
para.name = c(para.name, "sigma_theta")
colnames(mcmc_add[[1]]) = colnames(mcmc_add[[2]]) = para.name
# Get the initial value for the FWH model
mcmc_add_tmp = rbind.data.frame(mcmc_add[[1]], mcmc_add[[2]])
mu.init = median(mcmc_add_tmp[,grep("mu", colnames(mcmc_add_tmp))], na.rm = TRUE)
sigma_alpha.init = median(mcmc_add_tmp[,grep("sigma_alpha", colnames(mcmc_add_tmp))], na.rm = TRUE)
sigma_theta.init = median(mcmc_add_tmp[,grep("sigma_theta", colnames(mcmc_add_tmp))], na.rm = TRUE)
# 5. Write and run the Finlay Wilkinson Hierarchical (FWH) model ----------
message("Run Hierarchical Bayesian GxE model ...")
model_FWH_jags ="
model
{
# vraisemblance
for (i in 1:length(y))
{
y[i] ~ dnorm(y_hat[i],tau_epsilon)
y_hat[i] <- alpha[germplasm[i]] + beta[germplasm[i]]*theta[environment[i]]
epsilon[i] <- y[i] - y_hat[i]
}
# prior
for (i in 1:nb_germplasm)
{
alpha[i] ~ dnorm(mu,tau_alpha)
beta[i] ~ dnorm(1,tau_beta)
}
for (i in 1:nb_environment) { theta[i] ~ dnorm(0,tau_theta) }
tau_epsilon ~ dgamma(1.0E-6,1.0E-6)
sigma_epsilon <- 1/sqrt(tau_epsilon)
tau_alpha <- 1/pow(sigma_alpha,2)
sigma_alpha ~ dunif(0,nu)
tau_theta <- 1/pow(sigma_theta,2)
sigma_theta ~ dunif(0,nu)
tau_beta <- 1/pow(sigma_beta,2)
sigma_beta ~ dunif(0,1)
mu ~ dnorm(nu,1/pow(nu,2))
}
"
# Initial values with outputs frmo the additive model
init1 <- list(".RNG.name"="base::Mersenne-Twister", ".RNG.seed" = 1234, sigma_alpha = sigma_alpha.init, mu = mu.init, sigma_theta = sigma_theta.init)
init2 <- list(".RNG.name"="base::Wichmann-Hill", ".RNG.seed" = 5678, sigma_alpha = sigma_alpha.init, mu = mu.init, sigma_theta = sigma_theta.init)
init <- list(init1, init2)
# Model
model <- rjags::jags.model(file = textConnection(model_FWH_jags), data = d, inits = init, n.chains = 2)
# DIC for the FWH model
if(return.DIC) {
message("Calculation of DIC for FWH model ...")
DIC_FW = rjags::dic.samples(model, n.iter = nb_iterations, thin = 10, type = "pD")
} else {DIC_FW = NULL}
if( !is.null(parameters) ){
stats::update(model,1000) # Burn-in
# Run the model
mcmc_fwh <- rjags::coda.samples(model, parameters, n.iter = nb_iterations, thin = thin)
# Rename the parameters
# one again, the name of the parameters must be in the alphabetic order
n = colnames(mcmc_fwh[[1]])
para.name = NULL
if(return.alpha) {
alpha = n[grep("alpha\\[", n)]
alpha = sub("\\]", "", sub("alpha\\[", "", alpha))
alpha = paste("alpha[", germplasm.names.data[as.character(alpha)], "]", sep = "") # be careful, it must be as.character(alpha) to grep the right column name, otherwise, it is the nth position instead!
para.name = c(para.name, alpha)
}
if(return.beta) {
beta = n[grep("beta\\[", n)]
beta = sub("\\]", "", sub("beta\\[", "", beta))
beta = paste("beta[", germplasm.names.data[as.character(beta)], "]", sep = "") # be careful, cf up
para.name = c(para.name, beta)
}
if(return.sigma_alpha) { para.name = c(para.name, "sigma_alpha") }
if(return.sigma_beta) { para.name = c(para.name, "sigma_beta") }
if(return.sigma_epsilon) { para.name = c(para.name, "sigma_epsilon") }
if(return.sigma_theta) { para.name = c(para.name, "sigma_theta") }
if(return.theta) {
theta = n[grep("theta\\[", n)]
theta = sub("\\]", "", sub("theta\\[", "", theta))
theta = paste("theta[", environment.names.data[as.character(theta)], "]", sep = "") # be careful, cf up
para.name = c(para.name, theta)
}
colnames(mcmc_fwh[[1]]) = colnames(mcmc_fwh[[2]]) = para.name
# Correct the value of alpha, see vignette("PPBstats") for more details.
if( is.element("alpha", parameters) & is.element("theta", parameters)) {
MCMC1_fwh = as.data.frame(mcmc_fwh[[1]])
MCMC2_fwh = as.data.frame(mcmc_fwh[[2]])
vec_alpha = colnames(MCMC1_fwh)[grep("alpha\\[", colnames(MCMC1_fwh))]
vec_beta = colnames(MCMC1_fwh)[grep("beta\\[", colnames(MCMC1_fwh))]
theta1_bar = apply(MCMC1_fwh[,grep("theta", colnames(MCMC1_fwh))], 1, median)
MCMC1_fwh[,vec_alpha] = MCMC1_fwh[,vec_alpha] + MCMC1_fwh[,vec_beta]*theta1_bar
theta2_bar = apply(MCMC2_fwh[,grep("theta", colnames(MCMC2_fwh))], 1, median)
MCMC2_fwh[,vec_alpha] = MCMC2_fwh[,vec_alpha] + MCMC2_fwh[,vec_beta]*theta2_bar
mcmc_fwh = mcmc.list(as.mcmc(MCMC1_fwh), as.mcmc(MCMC2_fwh))
}
} else { mcmc_fwh = NULL }
# 6. For epsilon, it is done alone otherwise the memory do not manage with too big MCMC data frame. It is also useful for the cross validation study ----------
if(return.epsilon) {
mcmc_epsilon <- rjags::coda.samples(model, "epsilon", n.iter= nb_iterations, thin = thin)
MCMC_epsilon = rbind.data.frame(as.data.frame(mcmc_epsilon[[1]]), as.data.frame(mcmc_epsilon[[2]]))
epsilon = apply(MCMC_epsilon, 2, median, na.rm = TRUE)
} else {epsilon = NULL}
# 7. Return results ----------
OUT = list(
"data.presence.absence.matrix" = data.presence.absence.matrix,
"model2.presence.absence.matrix" = model.presence.absence.matrix,
"germplasm.not.used" = germplasm.not.used,
"MCMC" = mcmc_fwh,
"epsilon" = epsilon,
"DIC" = DIC_FW
)
class(OUT) <- c("PPBstats", "fit_model_bh_GxE")
return(OUT)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.