Preamble

License: CC BY-SA 4.0

References:

This R chunk is used to assess how much time it takes to execute the R code in this document until the end:

t0 <- proc.time()

External packages:

options(digits=5)
library(scrm)
library(MASS)
library(sommer)
library(breedR)
library(rstan)
library(rutilstimflutre)

To allow reproducibility, we set the seed of the generator of pseudo-random numbers:

set.seed(1859)

Statistical model

Notations:

Model:

[ Y = W C + Z G_A + E ]

with:

where $\mathcal{MN}_{n \times d}(M, U, V)$ represents a matrix-variate Normal distribution with mean matrix $M$ of dimension $n \times d$, and variance-covariance matrices $U$ and $V$ of respective dimensions $n \times n$ and $d \times d$.

Given that any matrix-variate Normal distribution is equivalent to a multivariate Normal distribution, $\text{MVN}(0, V \otimes U)$ of dimension $nd \times 1$, it is useful to re-write the likelihood using the $vec$ operator and Kronecker product ($\otimes$):

[ vec(Y) = (Id_T \otimes W) vec(C) + (Id_T \otimes Z) vec(G_A) + vec(E) ]

with:

Simulate data

Set dimensions:

T <- 2     # number of traits (don't change in this vignette)
I <- 100   # number of genotypes
Q <- 5     # number of replicates per genotype
N <- I * Q # number of phenotypes per trait

Genotypes

Simulate haplotypes via the coalescent with recombination, and encode the corresponding bi-allelic SNP genotypes additively as allele doses in ${0,1,2}$:

Ne <- 10^4
chrom.len <- 10^5
mu <- 10^(-8)
c <- 10^(-8)
genomes <- simulCoalescent(nb.inds=I,
                           pop.mut.rate=4 * Ne * mu * chrom.len,
                           pop.recomb.rate=4 * Ne * c * chrom.len,
                           chrom.len=chrom.len)
X <- genomes$genos
(P <- ncol(X))

Estimate additive genetic relationships:

A <- estimGenRel(X, relationships="additive", method="vanraden1", verbose=0)
summary(diag(A)) # under HWE, average should be 1
summary(A[upper.tri(A)]) # under HWE, average should be 0

Phenotypes

Simulate phenotypes:

mu <- c(50, 20) # global means
mean.C <- 5
sd.C <- 2
var.G.1 <- 2 # additive genotypic variance of trait 1
var.G.2 <- 4 # additive genotypic variance of trait 2
cor.G <- 0.7 # additive genotypic correlation between traits 1 and 2
cov.G <- cor.G * sqrt(var.G.1) * sqrt(var.G.2)
(V.G.A <- matrix(c(var.G.1, cov.G, cov.G, var.G.2), nrow=2, ncol=2))
var.E.1 <- 5
var.E.2 <- 4
cor.E <- -0.2
cov.E <- cor.E * sqrt(var.E.1) * sqrt(var.E.2)
(V.E <- matrix(c(var.E.1, cov.E, cov.E, var.E.2), nrow=2, ncol=2))
model <- simulAnimalModel(T=T, Q=Q, mu=mu, mean.C=mean.C, sd.C=sd.C,
                          A=A, V.G.A=V.G.A, V.E=V.E)
str(model$data)
regplot(model$G.A[,1], model$G.A[,2], xlab="G_A[,1]", ylab="G_A[,2]")

Explore the data

Summary statistics:

do.call(rbind, tapply(model$data$response1, model$data$year, summary))
do.call(rbind, tapply(model$data$response2, model$data$year, summary))

Plots:

hist(model$data$response1, breaks="FD", col="grey", border="white", las=1,
     main="Trait 1")
hist(model$data$response2, breaks="FD", col="grey", border="white", las=1,
     main="Trait 2")
regplot(model$data$response1, model$data$response2, asp=1,
        xlab="Trait1", ylab="Trait2")

Fit the model to data

Via ReML (breedR)

system.time(
    fit.remlf90 <- remlf90(fixed=cbind(response1, response2) ~ year,
                           generic=list(add=list(model$Z, A)),
                           data=model$data))

Via ReML (sommer)

system.time(
    fit.mmer <- mmer(Y=model$Y, X=model$W, Z=list(add=list(Z=model$Z, K=A)),
                     method="NR", MVM=TRUE, REML=TRUE))

Via HMC (rstan)

To efficiently sample $vec(G_A)$, we first sample from a standard univariate Normal, then multiply by the Cholesky decomposition of $V_{G_A} \otimes A$. Then, $vec(Y)$ will be sampled using multi_normal_cholesky. Write and compile the model:

stan.model <- "functions {
  // A is m x n; B is p x q
  // returns C = B %x% A which is mp x nq
  matrix kron_mat(matrix A, int m, int n, matrix B, int p, int q) {
    matrix[m*p,n*q] C;
    for(i in 1:m)
      for(j in 1:n)
        for(s in 1:p)
          for(t in 1:q)
            C[s+p*(i-1),t+q*(j-1)] = A[i,j] * B[s,t];
    return C;
  }
}
data {
  int<lower=0> I;
  int<lower=0> T;
  int<lower=0> Q;
  int<lower=0> N;
  vector[N*T] vecY;
  matrix[N,Q] W;
  vector[T] loc_mu;
  vector[T] scale_mu;
  matrix[N,I] Z;
  cov_matrix[I] A;
  cov_matrix[N] IdN;
  real nu_V_G_A;
  real nu_V_E;
}
"
stan.model.file <- "intro-mvAM_model.stan"
write(x=stan.model, file=stan.model.file)
rt <- stanc(file=stan.model.file, model_name="mvAM")
sm <- stan_model(stanc_ret=rt, verbose=FALSE)

Evaluate parameter estimates

From ReML (breedR)

summary(fit.remlf90)

From ReML (sommer)

names(fit.mmer)
fit.mmer$var.comp
rmse(c(fit.mmer$var.comp$add) - c(V.G.A))
rmse(fit.mmer$var.comp$add[1,1] - V.G.A[1,1])
rmse(fit.mmer$var.comp$add[2,2] - V.G.A[2,2])
rmse(fit.mmer$var.comp$add[1,2] - V.G.A[1,2])

rmse(c(fit.mmer$var.comp$Residual) - c(V.E))
rmse(fit.mmer$var.comp$Residual[1,1] - V.E[1,1])
rmse(fit.mmer$var.comp$Residual[2,2] - V.E[2,2])
rmse(fit.mmer$var.comp$Residual[1,2] - V.E[1,2])

rmse(fit.mmer$u.hat$add[,1] - model$G.A[,1])
rmse(fit.mmer$u.hat$add[,2] - model$G.A[,2])
cor(model$G.A[,1], fit.mmer$u.hat$add[,1])
cor(model$G.A[,2], fit.mmer$u.hat$add[,2])
regplot(model$G.A[,1], fit.mmer$u.hat$add[,1], xlab="true G_A[,1]",
        ylab="BLUP G_A[,1]")
regplot(model$G.A[,2], fit.mmer$u.hat$add[,2], xlab="true G_A[,2]",
        ylab="BLUP G_A[,2]")

From HMC

print(fit.stan, pars=grep("^V\\[", names(fit.stan), value=TRUE))

Conclusions

The ReML procedure is fast, but doesn't quantify the uncertainty in variance components. It hence is required to use an additional procedures to do this, e.g. the bootstrap, which can then be computationally costly.

Appendix

t1 <- proc.time()
t1 - t0
print(sessionInfo(), locale=FALSE)


timflutre/rutilstimflutre documentation built on Feb. 7, 2024, 8:17 a.m.