if(! "params" %in% ls()) params <- list(root.dir=NULL) knitr::opts_chunk$set(fig.align="center")
License: CC BY-SA 4.0
References:
This R chunk is used to assess how much time it takes to execute the R code in this document until the end:
t0 <- proc.time()
Load the required external packages:
options(digits=5) suppressPackageStartupMessages(library(scrm)) suppressPackageStartupMessages(library(MASS)) suppressPackageStartupMessages(library(rutilstimflutre))
Set the default values of the parameters:
inferPkg <- ""
Get the settings from the command-line:
if(! is.null(params$rmd.params)) for(i in seq_along(params$rmd.params)) assign(names(params$rmd.params)[i], params$rmd.params[[i]])
Check the parameters:
stopifnot(inferPkg %in% c("", "lme4", "sommer", "breedR", "rstan", "jags")) if(! inferPkg == ""){ suppressPackageStartupMessages(library(package=inferPkg, character.only=TRUE)) }
Notations:
$I$: number of genotypes
$T$: number of traits
$Q$: number of replicates
$N$: number of phenotypes per trait (here $I \times Q$)
$P$: number of SNPs
$Y$: $N \times T$ matrix of phenotypes
$W$: $N \times Q$ design matrix relating phenotypes to replicates
$C$: $Q \times T$ matrix of "fixed-effect" parameters
$Z$: $N \times I$ design matrix relating phenotypes to genotypes
$G_A$: $I \times T$ matrix of "random" additive genotypic values, so-called "breeding values"
$A$: $I \times I$ matrix of additive genetic relationships
$X$: $I \times P$ matrix of bi-allelic SNP genotypes encoded as allele dose in ${0,1,2}$
$V_{G_A}$: $T \times T$ matrix of genetic variances and covariances
$E$: $N \times T$ matrix of errors
$V_E$: $T \times T$ matrix of error variances and covariances
Model:
[ Y = W C + Z G_A + E ]
with:
$G_A \sim \mathcal{MN}(0, A, V_{G_A})$;
$E \sim \mathcal{MN}(0, Id, V_E)$.
where $\mathcal{MN}_{n \times d}(M, U, V)$ represents a matrix Normal distribution with mean matrix $M$ of dimension $n \times d$, and variance-covariance matrices $U$ and $V$ of respective dimensions $n \times n$ and $d \times d$.
Given that any matrix Normal distribution is equivalent to a multivariate Normal distribution, $\text{MVN}(0, V \otimes U)$ of dimension $nd \times 1$, it is useful to re-write the likelihood using the $vec$ operator and Kronecker product ($\otimes$):
[ vec(Y) = (Id_T \otimes W) vec(C) + (Id_T \otimes Z) vec(G_A) + vec(E) ]
with:
$vec(G_A) \sim \text{MVN}(0, V_{G_A} \otimes A)$;
$vec(E) \sim \text{MVN}(0, V_E \otimes Id_N)$.
To allow reproducibility, set the seed of the generator of pseudo-random numbers:
set.seed(1859)
Set dimensions:
T <- 2 # number of traits (don't change in this vignette) I <- 100 # number of genotypes Q <- 10 # number of replicates per genotype N <- I * Q # number of phenotypes per trait
Simulate haplotypes via the coalescent with recombination, and encode the corresponding bi-allelic SNP genotypes additively as allele doses in ${0,1,2}$:
Ne <- 10^4 chrom.len <- 10^5 mu <- 10^(-8) c <- 10^(-8) genomes <- simulCoalescent(nb.inds=I, pop.mut.rate=4 * Ne * mu * chrom.len, pop.recomb.rate=4 * Ne * c * chrom.len, chrom.len=chrom.len) X <- genomes$genos (P <- ncol(X))
Estimate additive genetic relationships:
A <- estimGenRel(X, relationships="additive", method="vanraden1", verbose=0) summary(diag(A)) # under HWE, average should be 1 summary(A[upper.tri(A)]) # under HWE, average should be 0
Choose the parameters:
mu <- c(50, 20) # global means mean.C <- 5 sd_C <- 2 var_G_1 <- 2 # additive genotypic variance of trait 1 var_G_2 <- 4 # additive genotypic variance of trait 2 cor_G <- 0.7 # additive genotypic correlation between traits 1 and 2 cov.G <- cor_G * sqrt(var_G_1) * sqrt(var_G_2) (V_G_A <- matrix(c(var_G_1, cov.G, cov.G, var_G_2), nrow=2, ncol=2)) kappa(V_G_A) h2_1 <- 0.7 # var_G_1 / (var_G_1 + var_E_1 / Q) h2_2 <- 0.3 # var_G_2 / (var_G_2 + var_E_2 / Q) ## var_E_1 <- TODO ## var_E_2 <- TODO var_E_1 <- 5 var_E_2 <- 4 cor_E <- 0 #-0.2 cov_E <- cor_E * sqrt(var_E_1) * sqrt(var_E_2) (V_E <- matrix(c(var_E_1, cov_E, cov_E, var_E_2), nrow=2, ncol=2)) kappa(V_E)
Simulate phenotypes:
model <- simulAnimalModel(T=T, Q=Q, mu=mu, mean.C=mean.C, sd.C=sd_C, A=A, V.G.A=V_G_A, V.E=V_E) str(model$data) head(model$data) regplot(model$G.A[,1], model$G.A[,2], xlab="G_A of trait1", ylab="G_A of trait2", main="Genotypic covariation", show.crit=c("corP", "R2adj"), legend.x="bottomright") abline(v=0, h=0, col="lightgrey")
Save the truth:
truth <- list(T=T, I=I, Q=Q, N=N, A=A) truth <- append(truth, model) truthFile <- "intro-mvAM_truth.rds" saveRDS(object=truth, file=truthFile) tools::md5sum(truthFile)
Data in the "wide" format:
dat <- model$data str(dat) head(dat)
Reformat in the "long" format:
datLong <- data.frame(geno=rep(dat$geno, times=2), trait=rep(c("response1","response2"), each=nrow(dat)), year=rep(dat$year, times=2), value=c(dat$response1, dat$response2), stringsAsFactors=TRUE) str(datLong) head(datLong)
Summary statistics:
do.call(rbind, tapply(dat$response1, dat$year, summary)) do.call(rbind, tapply(dat$response2, dat$year, summary))
Plots:
hist(dat$response1, breaks="FD", col="grey", border="white", las=1, main="Trait 1") hist(dat$response2, breaks="FD", col="grey", border="white", las=1, main="Trait 2") regplot(dat$response1, dat$response2, asp=1, xlab="Trait1", ylab="Trait2", main="Phenotypic covariation", show.crit=c("corP", "R2adj"), legend.x="bottomright") abline(v=mu[1], h=mu[2], col="lightgrey") op <- par(mar=c(8,4,4,2)+0.1) boxplot(value ~ year + trait, data=datLong, las=2, main="Phenotypes per trait and year", xlab="") par(op)
lme4
)if(inferPkg == "lme4"){ st <- system.time( fit.lmer <- lmer(value ~ -1 + trait + year + trait:year + (trait-1|geno), data=datLong, control=lmerControl(optimizer="bobyqa", check.nobs.vs.nlev="ignore", check.nobs.vs.nRE="ignore"))) print(st) }
breedR
)if(inferPkg == "breedR"){ st <- system.time( fit.remlf90 <- remlf90(fixed=cbind(response1, response2) ~ year, generic=list(add=list(model$Z, A)), data=dat)) print(st) }
sommer
)if(inferPkg == "sommer"){ st <- system.time( fit.mmer <- mmer(fixed=cbind(response1, response2) ~ year, random= ~ vsr(geno, Gu=A, Gtc=unsm(T)), rcov= ~ vsr(units, Gtc=unsm(T)), data=dat, method="NR")) print(st) }
rjags
)See code in the appendix of https://arxiv.org/abs/1507.08638
tmp.file.rjags <- "intro-mvAM_fit-rjags.rds" if(inferPkg == "rjags"){ if(! file.exists(tmp.file.rjags)){ ## TODO ## forJags <- list(N=N, m=m, d=d, ## y=y[1:d,1:N], x=x[1:m,], ## b0=rep(0, m), B0=diag(0.0001, m), ## Ku=diag(1, d), Ke=diag(1, d), ## Z=rep(0, d), r=r) st <- system.time( fit.rjags <- jagsAMmv(data=tmp, relmat=list(geno.add=A), nb.chains=nb.chains, nb.iters=nb.iters, burnin=burnin, thin=thin, rm.jags.file=TRUE, verbose=1)) print(st) saveRDS(fit.rjags, tmp.file.rjags) } else{ print("load tmp file...") fit.rjags <- readRDS(tmp.file.rjags) } }
rstan
)tmp.file.rstan <- "intro-mvAM_fit-rstan.rds" if(inferPkg == "rstan"){ if(! file.exists(tmp.file.rstan)){ nb.chains <- 1 burnin <- 1 * 10^3 thin <- 1 nb.usable.iters <- 1 * 10^3 nb.iters <- ceiling(burnin + (nb.usable.iters * thin) / nb.chains) tmp <- dat colnames(tmp)[colnames(tmp) == "geno"] <- "geno.add" st <- system.time( fit.rstan <- stanAMmv(data=tmp, relmat=list(geno.add=A), nb.chains=nb.chains, nb.iters=nb.iters, burnin=burnin, thin=thin, task.id="intro-mvAM", verbose=1, rm.stan.file=TRUE, rm.sm.file=TRUE)) print(st) saveRDS(fit.rstan, tmp.file.rstan) } else{ print("load tmp file...") fit.rstan <- readRDS(tmp.file.rstan) } }
lme4
)if(inferPkg == "lme4"){ print(summary(fit.lmer)) print(VarCorr(fit.lmer)) print(as.data.frame(VarCorr(fit.lmer))) print(truth$V.G.A) print(truth$V.E) }
breedR
)if(inferPkg == "breedR"){ print(summary(fit.remlf90)) }
sommer
)if(inferPkg == "sommer"){ names(fit.mmer) print(fit.mmer$sigma) print(truth[c("V.G.A","V.E")]) ## rmse(c(fit.mmer$var.comp$add) - c(V_G_A)) ## rmse(fit.mmer$var.comp$add[1,1] - V_G_A[1,1]) ## rmse(fit.mmer$var.comp$add[2,2] - V_G_A[2,2]) ## rmse(fit.mmer$var.comp$add[1,2] - V_G_A[1,2]) ## rmse(c(fit.mmer$var.comp$Residual) - c(V_E)) ## rmse(fit.mmer$var.comp$Residual[1,1] - V_E[1,1]) ## rmse(fit.mmer$var.comp$Residual[2,2] - V_E[2,2]) ## rmse(fit.mmer$var.comp$Residual[1,2] - V_E[1,2]) rmse(fit.mmer$u.hat$add[,1] - model$G.A[,1]) rmse(fit.mmer$u.hat$add[,2] - model$G.A[,2]) cor(model$G.A[,1], fit.mmer$u.hat$add[,1]) cor(model$G.A[,2], fit.mmer$u.hat$add[,2]) regplot(model$G.A[,1], fit.mmer$u.hat$add[,1], xlab="true G_A[,1]", ylab="BLUP G_A[,1]") regplot(model$G.A[,2], fit.mmer$u.hat$add[,2], xlab="true G_A[,2]", ylab="BLUP G_A[,2]") }
rstan
)Assess convergence:
## tmp <- grep("^corr[GE]\\[", names(fit.rstan), value=TRUE) tmp <- c("corrG[1,1]", "corrG[1,2]", "corrG[2,2]", "corrE[1,1]", "corrE[1,2]", "corrE[2,2]") rstan::traceplot(fit.rstan, pars=tmp) print(fit.rstan, pars=tmp)
The ReML procedure is fast, but it does not quantify the uncertainty in variance components. It hence is required to use an additional procedures to do this, e.g. the bootstrap, which can then be computationally costly.
t1 <- proc.time() t1 - t0 print(sessionInfo(), locale=FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.