License: CC BY-SA 4.0
References:
This R chunk is used to assess how much time it takes to execute the R code in this document until the end:
t0 <- proc.time()
External packages:
options(digits=5) library(scrm) library(MASS) library(sommer) library(breedR) library(rstan) library(rutilstimflutre)
To allow reproducibility, we set the seed of the generator of pseudo-random numbers:
set.seed(1859)
Notations:
$I$: number of genotypes
$T$: number of traits
$Q$: number of replicates
$N$: number of phenotypes per trait (i.e. $I \times Q$)
$P$: number of SNPs
$Y$: $N \times T$ matrix of phenotypes
$W$: $N \times Q$ design matrix relating phenotypes to replicates
$C$: $Q \times T$ matrix of "fixed-effect" parameters
$Z$: $N \times I$ design matrix relating phenotypes to genotypes
$G_A$: $I \times T$ matrix of "random" additive genotypic values, so-called "breeding values"
$A$: $I \times I$ matrix of additive genetic relationships
$X$: $I \times P$ matrix of bi-allelic SNP genotypes encoded as allele dose in ${0,1,2}$
$V_{G_A}$: $T \times T$ matrix of genetic variances and covariances
$E$: $N \times T$ matrix of errors
$V_E$: $T \times T$ matrix of error variances and covariances
Model:
[ Y = W C + Z G_A + E ]
with:
$G_A \sim \mathcal{MN}(0, A, V_{G_A})$;
$E \sim \mathcal{MN}(0, Id, V_E)$.
where $\mathcal{MN}_{n \times d}(M, U, V)$ represents a matrix-variate Normal distribution with mean matrix $M$ of dimension $n \times d$, and variance-covariance matrices $U$ and $V$ of respective dimensions $n \times n$ and $d \times d$.
Given that any matrix-variate Normal distribution is equivalent to a multivariate Normal distribution, $\text{MVN}(0, V \otimes U)$ of dimension $nd \times 1$, it is useful to re-write the likelihood using the $vec$ operator and Kronecker product ($\otimes$):
[ vec(Y) = (Id_T \otimes W) vec(C) + (Id_T \otimes Z) vec(G_A) + vec(E) ]
with:
$vec(G_A) \sim \text{MVN}(0, V_{G_A} \otimes A)$;
$vec(E) \sim \text{MVN}(0, V_E \otimes Id_N)$.
Set dimensions:
T <- 2 # number of traits (don't change in this vignette) I <- 100 # number of genotypes Q <- 5 # number of replicates per genotype N <- I * Q # number of phenotypes per trait
Simulate haplotypes via the coalescent with recombination, and encode the corresponding bi-allelic SNP genotypes additively as allele doses in ${0,1,2}$:
Ne <- 10^4 chrom.len <- 10^5 mu <- 10^(-8) c <- 10^(-8) genomes <- simulCoalescent(nb.inds=I, pop.mut.rate=4 * Ne * mu * chrom.len, pop.recomb.rate=4 * Ne * c * chrom.len, chrom.len=chrom.len) X <- genomes$genos (P <- ncol(X))
Estimate additive genetic relationships:
A <- estimGenRel(X, relationships="additive", method="vanraden1", verbose=0) summary(diag(A)) # under HWE, average should be 1 summary(A[upper.tri(A)]) # under HWE, average should be 0
Simulate phenotypes:
mu <- c(50, 20) # global means mean.C <- 5 sd.C <- 2 var.G.1 <- 2 # additive genotypic variance of trait 1 var.G.2 <- 4 # additive genotypic variance of trait 2 cor.G <- 0.7 # additive genotypic correlation between traits 1 and 2 cov.G <- cor.G * sqrt(var.G.1) * sqrt(var.G.2) (V.G.A <- matrix(c(var.G.1, cov.G, cov.G, var.G.2), nrow=2, ncol=2)) var.E.1 <- 5 var.E.2 <- 4 cor.E <- -0.2 cov.E <- cor.E * sqrt(var.E.1) * sqrt(var.E.2) (V.E <- matrix(c(var.E.1, cov.E, cov.E, var.E.2), nrow=2, ncol=2)) model <- simulAnimalModel(T=T, Q=Q, mu=mu, mean.C=mean.C, sd.C=sd.C, A=A, V.G.A=V.G.A, V.E=V.E) str(model$data) regplot(model$G.A[,1], model$G.A[,2], xlab="G_A[,1]", ylab="G_A[,2]")
Summary statistics:
do.call(rbind, tapply(model$data$response1, model$data$year, summary)) do.call(rbind, tapply(model$data$response2, model$data$year, summary))
Plots:
hist(model$data$response1, breaks="FD", col="grey", border="white", las=1, main="Trait 1") hist(model$data$response2, breaks="FD", col="grey", border="white", las=1, main="Trait 2") regplot(model$data$response1, model$data$response2, asp=1, xlab="Trait1", ylab="Trait2")
breedR
)system.time( fit.remlf90 <- remlf90(fixed=cbind(response1, response2) ~ year, generic=list(add=list(model$Z, A)), data=model$data))
sommer
)system.time( fit.mmer <- mmer(Y=model$Y, X=model$W, Z=list(add=list(Z=model$Z, K=A)), method="NR", MVM=TRUE, REML=TRUE))
rstan
)To efficiently sample $vec(G_A)$, we first sample from a standard univariate Normal, then multiply by the Cholesky decomposition of $V_{G_A} \otimes A$.
Then, $vec(Y)$ will be sampled using multi_normal_cholesky
.
Write and compile the model:
stan.model <- "functions { // A is m x n; B is p x q // returns C = B %x% A which is mp x nq matrix kron_mat(matrix A, int m, int n, matrix B, int p, int q) { matrix[m*p,n*q] C; for(i in 1:m) for(j in 1:n) for(s in 1:p) for(t in 1:q) C[s+p*(i-1),t+q*(j-1)] = A[i,j] * B[s,t]; return C; } } data { int<lower=0> I; int<lower=0> T; int<lower=0> Q; int<lower=0> N; vector[N*T] vecY; matrix[N,Q] W; vector[T] loc_mu; vector[T] scale_mu; matrix[N,I] Z; cov_matrix[I] A; cov_matrix[N] IdN; real nu_V_G_A; real nu_V_E; } " stan.model.file <- "intro-mvAM_model.stan" write(x=stan.model, file=stan.model.file) rt <- stanc(file=stan.model.file, model_name="mvAM") sm <- stan_model(stanc_ret=rt, verbose=FALSE)
breedR
)summary(fit.remlf90)
sommer
)names(fit.mmer) fit.mmer$var.comp rmse(c(fit.mmer$var.comp$add) - c(V.G.A)) rmse(fit.mmer$var.comp$add[1,1] - V.G.A[1,1]) rmse(fit.mmer$var.comp$add[2,2] - V.G.A[2,2]) rmse(fit.mmer$var.comp$add[1,2] - V.G.A[1,2]) rmse(c(fit.mmer$var.comp$Residual) - c(V.E)) rmse(fit.mmer$var.comp$Residual[1,1] - V.E[1,1]) rmse(fit.mmer$var.comp$Residual[2,2] - V.E[2,2]) rmse(fit.mmer$var.comp$Residual[1,2] - V.E[1,2]) rmse(fit.mmer$u.hat$add[,1] - model$G.A[,1]) rmse(fit.mmer$u.hat$add[,2] - model$G.A[,2]) cor(model$G.A[,1], fit.mmer$u.hat$add[,1]) cor(model$G.A[,2], fit.mmer$u.hat$add[,2]) regplot(model$G.A[,1], fit.mmer$u.hat$add[,1], xlab="true G_A[,1]", ylab="BLUP G_A[,1]") regplot(model$G.A[,2], fit.mmer$u.hat$add[,2], xlab="true G_A[,2]", ylab="BLUP G_A[,2]")
print(fit.stan, pars=grep("^V\\[", names(fit.stan), value=TRUE))
The ReML procedure is fast, but doesn't quantify the uncertainty in variance components. It hence is required to use an additional procedures to do this, e.g. the bootstrap, which can then be computationally costly.
t1 <- proc.time() t1 - t0 print(sessionInfo(), locale=FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.