if(! "params" %in% ls())
  params <- list(root.dir=NULL)
knitr::opts_chunk$set(fig.align="center")

Preamble

License: CC BY-SA 4.0

References:

This R chunk is used to assess how much time it takes to execute the R code in this document until the end:

t0 <- proc.time()

Set-up

Load the required external packages:

options(digits=5)
suppressPackageStartupMessages(library(scrm))
suppressPackageStartupMessages(library(MASS))
suppressPackageStartupMessages(library(rutilstimflutre))

Set the default values of the parameters:

inferPkg <- ""

Get the settings from the command-line:

if(! is.null(params$rmd.params))
  for(i in seq_along(params$rmd.params))
    assign(names(params$rmd.params)[i], params$rmd.params[[i]])

Check the parameters:

stopifnot(inferPkg %in% c("", "lme4", "sommer", "breedR", "rstan", "jags"))
if(! inferPkg == ""){
  suppressPackageStartupMessages(library(package=inferPkg, character.only=TRUE))
}

Statistical model

Notations:

Model:

[ Y = W C + Z G_A + E ]

with:

where $\mathcal{MN}_{n \times d}(M, U, V)$ represents a matrix Normal distribution with mean matrix $M$ of dimension $n \times d$, and variance-covariance matrices $U$ and $V$ of respective dimensions $n \times n$ and $d \times d$.

Given that any matrix Normal distribution is equivalent to a multivariate Normal distribution, $\text{MVN}(0, V \otimes U)$ of dimension $nd \times 1$, it is useful to re-write the likelihood using the $vec$ operator and Kronecker product ($\otimes$):

[ vec(Y) = (Id_T \otimes W) vec(C) + (Id_T \otimes Z) vec(G_A) + vec(E) ]

with:

Simulate data

To allow reproducibility, set the seed of the generator of pseudo-random numbers:

set.seed(1859)

Set dimensions:

T <- 2     # number of traits (don't change in this vignette)
I <- 100   # number of genotypes
Q <- 10    # number of replicates per genotype
N <- I * Q # number of phenotypes per trait

Genotypes

Simulate haplotypes via the coalescent with recombination, and encode the corresponding bi-allelic SNP genotypes additively as allele doses in ${0,1,2}$:

Ne <- 10^4
chrom.len <- 10^5
mu <- 10^(-8)
c <- 10^(-8)
genomes <- simulCoalescent(nb.inds=I,
                           pop.mut.rate=4 * Ne * mu * chrom.len,
                           pop.recomb.rate=4 * Ne * c * chrom.len,
                           chrom.len=chrom.len)
X <- genomes$genos
(P <- ncol(X))

Estimate additive genetic relationships:

A <- estimGenRel(X, relationships="additive", method="vanraden1", verbose=0)
summary(diag(A)) # under HWE, average should be 1
summary(A[upper.tri(A)]) # under HWE, average should be 0

Phenotypes

Choose the parameters:

mu <- c(50, 20) # global means
mean.C <- 5
sd_C <- 2
var_G_1 <- 2 # additive genotypic variance of trait 1
var_G_2 <- 4 # additive genotypic variance of trait 2
cor_G <- 0.7 # additive genotypic correlation between traits 1 and 2
cov.G <- cor_G * sqrt(var_G_1) * sqrt(var_G_2)
(V_G_A <- matrix(c(var_G_1, cov.G, cov.G, var_G_2), nrow=2, ncol=2))
kappa(V_G_A)
h2_1 <- 0.7 # var_G_1 / (var_G_1 + var_E_1 / Q)
h2_2 <- 0.3 # var_G_2 / (var_G_2 + var_E_2 / Q)
## var_E_1 <- TODO
## var_E_2 <- TODO
var_E_1 <- 5
var_E_2 <- 4
cor_E <- 0 #-0.2
cov_E <- cor_E * sqrt(var_E_1) * sqrt(var_E_2)
(V_E <- matrix(c(var_E_1, cov_E, cov_E, var_E_2), nrow=2, ncol=2))
kappa(V_E)

Simulate phenotypes:

model <- simulAnimalModel(T=T, Q=Q, mu=mu, mean.C=mean.C, sd.C=sd_C,
                          A=A, V.G.A=V_G_A, V.E=V_E)
str(model$data)
head(model$data)
regplot(model$G.A[,1], model$G.A[,2],
        xlab="G_A of trait1", ylab="G_A of trait2", main="Genotypic covariation",
        show.crit=c("corP", "R2adj"), legend.x="bottomright")
abline(v=0, h=0, col="lightgrey")

Save the truth:

truth <- list(T=T, I=I, Q=Q, N=N,
              A=A)
truth <- append(truth, model)
truthFile <- "intro-mvAM_truth.rds"
saveRDS(object=truth, file=truthFile)
tools::md5sum(truthFile)

Explore the data

Data in the "wide" format:

dat <- model$data
str(dat)
head(dat)

Reformat in the "long" format:

datLong <- data.frame(geno=rep(dat$geno, times=2),
                      trait=rep(c("response1","response2"), each=nrow(dat)),
                      year=rep(dat$year, times=2),
                      value=c(dat$response1, dat$response2),
                      stringsAsFactors=TRUE)
str(datLong)
head(datLong)

Summary statistics:

do.call(rbind, tapply(dat$response1, dat$year, summary))
do.call(rbind, tapply(dat$response2, dat$year, summary))

Plots:

hist(dat$response1, breaks="FD", col="grey", border="white", las=1,
     main="Trait 1")
hist(dat$response2, breaks="FD", col="grey", border="white", las=1,
     main="Trait 2")
regplot(dat$response1, dat$response2, asp=1,
        xlab="Trait1", ylab="Trait2", main="Phenotypic covariation",
        show.crit=c("corP", "R2adj"), legend.x="bottomright")
abline(v=mu[1], h=mu[2], col="lightgrey")
op <- par(mar=c(8,4,4,2)+0.1)
boxplot(value ~ year + trait, data=datLong, las=2,
        main="Phenotypes per trait and year", xlab="")
par(op)

Fit the model to data

Vie ReML (lme4)

Trick

if(inferPkg == "lme4"){
  st <- system.time(
      fit.lmer <- lmer(value ~ -1 + trait + year + trait:year + (trait-1|geno),
                       data=datLong,
                       control=lmerControl(optimizer="bobyqa",
                                           check.nobs.vs.nlev="ignore",
                                           check.nobs.vs.nRE="ignore")))
  print(st)
}

Via ReML (breedR)

if(inferPkg == "breedR"){
  st <- system.time(
      fit.remlf90 <- remlf90(fixed=cbind(response1, response2) ~ year,
                             generic=list(add=list(model$Z, A)),
                             data=dat))
  print(st)
}

Via ReML (sommer)

if(inferPkg == "sommer"){
  st <- system.time(
      fit.mmer <- mmer(fixed=cbind(response1, response2) ~ year,
                       random= ~ vsr(geno, Gu=A, Gtc=unsm(T)),
                       rcov= ~ vsr(units, Gtc=unsm(T)),
                       data=dat,
                       method="NR"))
  print(st)
}

Via Gibbs (rjags)

See code in the appendix of https://arxiv.org/abs/1507.08638

tmp.file.rjags <- "intro-mvAM_fit-rjags.rds"
if(inferPkg == "rjags"){
  if(! file.exists(tmp.file.rjags)){
    ## TODO
    ## forJags <- list(N=N, m=m, d=d,
    ##                 y=y[1:d,1:N], x=x[1:m,],
    ##                 b0=rep(0, m), B0=diag(0.0001, m),
    ##                 Ku=diag(1, d), Ke=diag(1, d),
    ##                 Z=rep(0, d), r=r)
    st <- system.time(
        fit.rjags <- jagsAMmv(data=tmp, relmat=list(geno.add=A),
                              nb.chains=nb.chains, nb.iters=nb.iters,
                              burnin=burnin, thin=thin,
                              rm.jags.file=TRUE, verbose=1))
    print(st)
    saveRDS(fit.rjags, tmp.file.rjags)
  } else{
    print("load tmp file...")
    fit.rjags <- readRDS(tmp.file.rjags)
  }
}

Via HMC (rstan)

tmp.file.rstan <- "intro-mvAM_fit-rstan.rds"
if(inferPkg == "rstan"){
  if(! file.exists(tmp.file.rstan)){
    nb.chains <- 1
    burnin <- 1 * 10^3
    thin <- 1
    nb.usable.iters <- 1 * 10^3
    nb.iters <- ceiling(burnin + (nb.usable.iters * thin) / nb.chains)
    tmp <- dat
    colnames(tmp)[colnames(tmp) == "geno"] <- "geno.add"
    st <- system.time(
        fit.rstan <- stanAMmv(data=tmp, relmat=list(geno.add=A),
                              nb.chains=nb.chains, nb.iters=nb.iters,
                              burnin=burnin, thin=thin,
                              task.id="intro-mvAM",
                              verbose=1,
                              rm.stan.file=TRUE, rm.sm.file=TRUE))
    print(st)
    saveRDS(fit.rstan, tmp.file.rstan)
  } else{
    print("load tmp file...")
    fit.rstan <- readRDS(tmp.file.rstan)
  }
}

Evaluate parameter estimates

From ReML (lme4)

if(inferPkg == "lme4"){
  print(summary(fit.lmer))
  print(VarCorr(fit.lmer))
  print(as.data.frame(VarCorr(fit.lmer)))
  print(truth$V.G.A)
  print(truth$V.E)
}

From ReML (breedR)

if(inferPkg == "breedR"){
  print(summary(fit.remlf90))
}

From ReML (sommer)

if(inferPkg == "sommer"){
  names(fit.mmer)
  print(fit.mmer$sigma)
  print(truth[c("V.G.A","V.E")])

  ## rmse(c(fit.mmer$var.comp$add) - c(V_G_A))
  ## rmse(fit.mmer$var.comp$add[1,1] - V_G_A[1,1])
  ## rmse(fit.mmer$var.comp$add[2,2] - V_G_A[2,2])
  ## rmse(fit.mmer$var.comp$add[1,2] - V_G_A[1,2])

  ## rmse(c(fit.mmer$var.comp$Residual) - c(V_E))
  ## rmse(fit.mmer$var.comp$Residual[1,1] - V_E[1,1])
  ## rmse(fit.mmer$var.comp$Residual[2,2] - V_E[2,2])
  ## rmse(fit.mmer$var.comp$Residual[1,2] - V_E[1,2])

  rmse(fit.mmer$u.hat$add[,1] - model$G.A[,1])
  rmse(fit.mmer$u.hat$add[,2] - model$G.A[,2])
  cor(model$G.A[,1], fit.mmer$u.hat$add[,1])
  cor(model$G.A[,2], fit.mmer$u.hat$add[,2])
  regplot(model$G.A[,1], fit.mmer$u.hat$add[,1], xlab="true G_A[,1]",
          ylab="BLUP G_A[,1]")
  regplot(model$G.A[,2], fit.mmer$u.hat$add[,2], xlab="true G_A[,2]",
          ylab="BLUP G_A[,2]")
}

From HMC (rstan)

Assess convergence:

## tmp <- grep("^corr[GE]\\[", names(fit.rstan), value=TRUE)
tmp <- c("corrG[1,1]", "corrG[1,2]", "corrG[2,2]",
         "corrE[1,1]", "corrE[1,2]", "corrE[2,2]")
rstan::traceplot(fit.rstan, pars=tmp)
print(fit.rstan, pars=tmp)

Conclusions

The ReML procedure is fast, but it does not quantify the uncertainty in variance components. It hence is required to use an additional procedures to do this, e.g. the bootstrap, which can then be computationally costly.

Appendix

t1 <- proc.time()
t1 - t0
print(sessionInfo(), locale=FALSE)


timflutre/rutilstimflutre documentation built on Aug. 18, 2024, 7:43 p.m.