```r
library(knitr)
knitr::opts_chunk$set(fig.width=6, fig.height=5, fig.align="center",
                      global.par=TRUE)
par(mar=c(5, 4, 2, 0.5))
```
License: CC BY-SA 4.0
References:

- Pinheiro & Bates (2000): "Mixed-Effects Models in S and S-PLUS"
- Gałecki & Burzykowski (2013): "Linear Mixed-Effects Models Using R: A Step-by-Step Approach"
- Martins et al. (2013): "Bayesian computing with INLA: new features"
- Carpenter et al. (2016): "Stan: a probabilistic programming language"
External packages:
```r
library(rutilstimflutre)
library(lattice)
library(cvTools)
library(caret)
library(nlme)
library(lme4)
library(MM4LMM)
library(INLA)
library(rstan)
library(loo)
```
This R chunk is used to assess how much time it takes to execute the R code in this document until the end:
```r
t0 <- proc.time()
```
Goal: study the genotype-phenotype map
Notations:

- $g \in \{1,\ldots,G\}$: index of the genotype (assuming a plant species whose genotypes can be cloned and planted in multiple blocks), with $G$ assumed to be "large" (say, 100)
- $b \in \{1,\ldots,B\}$: index of the block (field plots are often divided into multiple blocks to handle spatial heterogeneity), with $B$ assumed to be "small" (say, 3)
- $y_{gb}$: observed value (phenotype) of the $g$-th genotype in the $b$-th block
- $\beta_b$: fixed-effect parameter of the $b$-th block
- $\mu$: global intercept
- $u_g$: random variable corresponding to the $g$-th genotype
Likelihood:
\[ y_{gb} = \mu + \beta_b + u_g + \epsilon_{gb} \text{ where } \epsilon_{gb} \sim \mathcal{N}(0, \sigma_b^2) \text{ and } u_g \sim \mathcal{N}(0, \sigma_u^2) \]
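In matrix notation, as implemented in the simulation code below, with $N = G \times B$ observations, $X$ the $N \times B$ design matrix of the intercept and block effects (the first element of $\beta$ being $\mu$), and $Z$ the $N \times G$ incidence matrix of the genotypes:

\[ y = X \beta + Z u + \epsilon \]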
This model is heteroscedastic, as the errors have different variances depending on the block: $\sigma_{b=1}$ may differ from $\sigma_{b=2}$, etc.
With real data, it is not that rare to have large errors ("outliers"), which calls for a more robust noise model, e.g., based on Student's $t$ distribution, $\epsilon_{gb} \sim t(\nu, 0, \sigma_b^2)$, which reduces to a Normal when the number of degrees of freedom $\nu \rightarrow +\infty$.
Moreover, it is also frequent to have missing data, which unbalances the design.
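As a quick numerical check (this snippet is illustrative and not part of the simulation below), a scaled Student's $t$ with few degrees of freedom indeed produces many more extreme errors than a Normal with the same scale:

```r
set.seed(123)
n <- 10^5
sc <- 2 # common scale
mean(abs(rnorm(n, mean=0, sd=sc)) > 3 * sc) # Normal: ~0.003
mean(abs(sc * rt(n, df=3)) > 3 * sc)        # scaled t_3: ~0.06
```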
```r
set.seed(1859)
B <- 3
G <- 2.5 * 10^2
head(geno.ids <- sprintf(fmt=paste0("g%0", floor(log10(G)) + 1, "i"), 1:G))
dat <- data.frame(block=rep(LETTERS[1:B], each=G),
                  geno=rep(geno.ids, B),
                  stringsAsFactors=TRUE)
X <- model.matrix(~ 1 + block, data=dat)
mu <- 50
(beta <- c(mu, rnorm(n=B-1, mean=0, sd=3)))
Z <- model.matrix(~ -1 + geno, data=dat)
sigma.u <- 4
u <- setNames(rnorm(n=G, mean=0, sd=sigma.u), geno.ids)
(sigma.b <- setNames(sample(1:8, size=B, replace=(B > 8)),
                     levels(dat$block)))
nu <- +Inf # degrees of freedom of the errors (+Inf means Normal)
epsilon <- do.call(c, lapply(sigma.b, function(sd.b){
  if(is.infinite(nu)){
    rnorm(n=G, mean=0, sd=sd.b)
  } else # scaled Student's t (note: not rt()'s "ncp", a noncentrality, not a scale)
    sd.b * rt(n=G, df=nu)
}))
dat$y <- X %*% beta + Z %*% u + epsilon
str(dat)
```
Don't hesitate to experiment by changing some values, e.g., increasing or decreasing $B$ and/or $G$, setting $\sigma_u$ to 0 or not, setting $\nu$ to $+\infty$ or not, etc.
Descriptive stats:
```r
(tmp <- do.call(rbind, tapply(dat$y, dat$block, function(y.b){
  c(mean=mean(y.b), sd=sd(y.b))
})))
```
Visualization:
```r
boxplot(y ~ block, data=dat, las=1, varwidth=TRUE,
        xlab="blocks", ylab="phenotypes",
        main=paste0("Simulated data (nu=", nu, ", sigma_u=",
                    format(sigma.u, digits=2), ")"))
abline(h=mean(dat$y), lty=2)
```
For diagnostics:
```r
diagnose <- function(fit.name, tmp){
  message("variance of all residuals:")
  print(var(tmp$scaled.residuals))
  message("variance of the residuals per block:")
  print(tapply(tmp$scaled.residuals, tmp$block, function(res.b){
    var(res.b)
  }))
  message("outliers among all residuals:")
  print(c("<-2"=sum(tmp$scaled.residuals < -2),
          ">+2"=sum(tmp$scaled.residuals > 2)))
  message("outliers among residuals per block:")
  print(do.call(rbind, tapply(tmp$scaled.residuals, tmp$block, function(res.b){
    c("<-2"=sum(res.b < -2), ">+2"=sum(res.b > 2))
  })))
  plot(x=tmp$scaled.residuals, y=tmp$y.hat, las=1,
       main=paste0("Diagnostics ", fit.name),
       xlab="scaled residuals", ylab="y.hat")
  abline(v=c(-2, 0, 2), lty=2)
  boxplot(scaled.residuals ~ block, data=tmp, las=1, varwidth=TRUE,
          horizontal=TRUE, xlab="scaled residuals", ylab="blocks",
          main=paste0("Diagnostics ", fit.name))
  abline(v=c(-2, 0, 2), lty=2)
  ## lattice plots must be print()ed to be displayed from inside a function:
  print(dotplot(geno ~ scaled.residuals, data=tmp,
                main=paste0("Diagnostics ", fit.name),
                panel=function(x, y, ...){
                  panel.dotplot(x, y, ...)
                  panel.abline(v=c(-2, 0, 2), lty=2)
                }))
  regplot(x=tmp$y, y=tmp$y.hat, reg=c("lm","rlm"),
          col=c(lm="red", rlm="blue"), pred.int=FALSE, asp=1, las=1,
          main=paste0("Diagnostics ", fit.name),
          xlab="y", ylab="y.hat", legend.x="bottomright")
  abline(a=0, b=1, lty=2)
  abline(v=mean(tmp$y), h=mean(tmp$y.hat), lty=2)
}
```
For model selection:
```r
cvf <- cvTools::cvFolds(n=nrow(dat), K=5, type="interleaved")
train.ctl <- caret::trainControl(method="repeatedcv", number=5, repeats=3)
```
For inference:
```r
u.sort.desc <- sort(u, decreasing=TRUE)
perc.threshold <- 10
true.best.genos <- names(u.sort.desc)[1:ceiling(perc.threshold/100 * G)]
summary(u[true.best.genos])
```
## gls

The `gls` function from the `nlme` package performs inference via generalized least squares, but cannot handle random variables.
Assuming homoscedasticity:
```r
fit.gls.hom <- gls(model=y ~ 1 + block, data=dat)
beta.hat.hom <- coef(fit.gls.hom)
sigma.b.hat.hom <- fit.gls.hom$sigma
scaled.residuals.hom <- residuals(fit.gls.hom) / sigma.b.hat.hom
y.hat.hom <- fitted(fit.gls.hom)
tmp.hom <- cbind(dat, "scaled.residuals"=scaled.residuals.hom,
                 "y.hat"=y.hat.hom)
```
Assuming heteroscedasticity:
```r
fit.gls.het <- gls(model=y ~ 1 + block, data=dat,
                   weights=varIdent(form=~1|block))
beta.hat.het <- coef(fit.gls.het)
## varIdent is parameterized on the log scale, as ratios to the std dev
## of the first (reference) block:
sigma.b.hat.het <- c(1, exp(coef(fit.gls.het$modelStruct$varStruct))) *
  fit.gls.het$sigma
scaled.residuals.het <- residuals(fit.gls.het) / rep(sigma.b.hat.het, each=G)
y.hat.het <- fitted(fit.gls.het)
tmp.het <- cbind(dat, "scaled.residuals"=scaled.residuals.het,
                 "y.hat"=y.hat.het)
```
Clearly, the model assuming one error variance per block, i.e., heteroscedasticity, fits better than the model assuming homoscedasticity; as both models were fitted by REML with the same fixed effects, their likelihoods are directly comparable:
```r
AIC(fit.gls.hom)
AIC(fit.gls.het)
AIC(fit.gls.hom, fit.gls.het)
BIC(fit.gls.hom)
BIC(fit.gls.het)
BIC(fit.gls.hom, fit.gls.het)
anova(fit.gls.hom, fit.gls.het)
```
```r
## cross-validation with cvTools: gls()'s first argument is named "model",
## not "formula", so a thin wrapper is needed for cvFit() (the
## heteroscedastic variance structure is baked into the wrapper):
glsHetWrap <- function(formula, data){
  gls(model=formula, data=data, weights=varIdent(form=~1|block))
}
fit.gls.het.cv <- cvFit(glsHetWrap, formula=y ~ 1 + block, data=dat,
                        y=dat$y, cost=rmspe, folds=cvf)
fit.gls.het.cv
## cross-validation with caret, via a custom method (a sketch of the
## interface documented in the caret vignette "Using your own model in
## train"):
glsMeth <- list(label="Generalized least squares",
                library="nlme",
                type="Regression",
                parameters=NULL,
                grid=function(x, y, len=NULL, search){},
                fit=function(form, data, var.func){
                  gls(model=form, data=data, weights=var.func)
                },
                predict=function(modelFit, newdata){
                  predict(modelFit, newdata)
                },
                prob=NULL,
                sort=NULL)
fit.gls.het.cv <- train(form=y ~ 1 + block, data=dat,
                        var.func=varIdent(form=~1|block),
                        method=glsMeth, trControl=train.ctl)
fit.gls.het.cv
```
diagnose("gls (hom)", tmp.hom) diagnose("gls (het)", tmp.het)
```r
summary(fit.gls.hom)
cbind(beta, beta.hat.hom, confint(fit.gls.hom))
cbind(sigma.b, sigma.b.hat.hom)
(tav <- anova(fit.gls.hom))
```
```r
summary(fit.gls.het)
cbind(beta, beta.hat.het, confint(fit.gls.het))
cbind(sigma.b, sigma.b.hat.het)
(tav <- anova(fit.gls.het))
```
Unfortunately, `anova` applied to a `gls` object does not return the mean sums of squares. In the code of `nlme:::anova.gls`, one can see how the F statistics are computed (`Fval <- sum(c0^2)/nDF`).
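This amounts to a Wald F statistic; as a minimal sketch, one can recompute it by hand for the `block` term from the estimated coefficients and their covariance matrix. Since `block` is the last (and only) term besides the intercept, the sequential test reported by `anova` should coincide with this marginal version:

```r
b.hat <- coef(fit.gls.het)
idx <- grep("^block", names(b.hat))   # coefficients of the block term
V.hat <- vcov(fit.gls.het)[idx, idx]  # their estimated covariance matrix
(Fval <- drop(t(b.hat[idx]) %*% solve(V.hat) %*% b.hat[idx]) / length(idx))
anova(fit.gls.het)                    # compare with the "block" row
```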
## lmer

The `lmer` function from the `lme4` package performs inference via ReML and handles random variables, but cannot handle heteroscedasticity.
```r
fit.lmer.hom <- lmer(formula=y ~ 1 + block + (1|geno), data=dat)
beta.hat <- fixef(fit.lmer.hom)
u.hat <- setNames(ranef(fit.lmer.hom)$geno[,"(Intercept)"],
                  rownames(ranef(fit.lmer.hom)$geno))[names(u)]
sigma.u.hat <- sqrt(VarCorr(fit.lmer.hom)$geno[1])
sigma(fit.lmer.hom)
scaled.residuals <- residuals(fit.lmer.hom) / sigma(fit.lmer.hom)
y.hat <- fitted(fit.lmer.hom)
tmp <- cbind(dat, scaled.residuals, y.hat)
```
```r
extractAIC(fit.lmer.hom)
BIC(fit.lmer.hom)
```
diagnose("lmer (hom)", tmp)
```r
summary(fit.lmer.hom)
cbind(beta, beta.hat, confint(fit.lmer.hom, parm="beta_", quiet=TRUE))
if(sigma.u > 0){
  print(c(sigma.u, sigma.u.hat))
  regplot(x=u, y=u.hat, pred.int=FALSE, asp=1, las=1,
          main="Diagnostics lmer", xlab="u", ylab="u.hat",
          legend.x="bottomright")
  abline(a=0, b=1, lty=2); abline(v=0, h=0, lty=2)
  u.hat.sort.desc <- sort(u.hat, decreasing=TRUE)
  pred.best.genos <- names(u.hat.sort.desc)[1:ceiling(perc.threshold/100 * G)]
  message(paste0("genotypes predicted to be among the ", perc.threshold,
                 "% best: ", sum(pred.best.genos %in% true.best.genos),
                 "/", length(true.best.genos)))
}
```
## lme

The `lme` function from the `nlme` package performs inference via ReML, and can handle both random variables and heteroscedasticity.
```r
fit.lme.het <- lme(fixed=y ~ 1 + block, random=~1|geno,
                   weights=varIdent(form=~1|block), data=dat)
beta.hat <- fixef(fit.lme.het)
u.hat <- setNames(ranef(fit.lme.het)[,"(Intercept)"],
                  rownames(ranef(fit.lme.het)))[names(u)]
sigma.u.hat <- as.numeric(VarCorr(fit.lme.het)[1, "StdDev"])
sigma.b.hat <- c(1, exp(coef(fit.lme.het$modelStruct$varStruct))) *
  fit.lme.het$sigma
scaled.residuals <- residuals(fit.lme.het) / rep(sigma.b.hat, each=G)
y.hat <- fitted(fit.lme.het)
tmp <- cbind(dat, scaled.residuals, y.hat)
```
```r
AIC(fit.lme.het)
BIC(fit.lme.het)
```
diagnose("lme (het)", tmp)
```r
summary(fit.lme.het)
cbind(beta, beta.hat,
      intervals(fit.lme.het, which="fixed")$fixed[, c("lower","upper")])
cbind(sigma.b, sigma.b.hat)
if(sigma.u > 0){
  print(c(sigma.u, sigma.u.hat))
  print(c(cor(u, u.hat, method="pearson"), cor(u, u.hat, method="spearman")))
  regplot(x=u, y=u.hat, pred.int=FALSE, asp=1, las=1,
          main="Diagnostics lme", xlab="u", ylab="u.hat",
          legend.x="bottomright")
  abline(a=0, b=1, lty=2); abline(v=0, h=0, lty=2)
  u.hat.sort.desc <- sort(u.hat, decreasing=TRUE)
  pred.best.genos <- names(u.hat.sort.desc)[1:ceiling(perc.threshold/100 * G)]
  message(paste0("genotypes predicted to be among the ", perc.threshold,
                 "% best: ", sum(pred.best.genos %in% true.best.genos),
                 "/", length(true.best.genos)))
}
```
## MMEst

The `MMEst` function from the `MM4LMM` package performs inference via ReML, and can handle both random variables and heteroscedasticity.
Assuming homoscedasticity:
lZ <- list("geno"=Z, "Error"=diag(nrow(dat))) lV <- list("geno"=diag(nlevels(dat$geno)), "Error"=diag(nrow(dat))) fit.MMEst.hom <- MMEst(Y=dat$y, Cofactor=X, VarList=lV, ZList=lZ, Method="Reml") fit.MMEst.hom.ML <- MMEst(Y=dat$y, Cofactor=X, VarList=lV, ZList=lZ, Method="ML") beta.hat <- fit.MMEst.hom$NullModel$Beta tmp <- MMBlup(Y=dat$y, Cofactor=X, ZList=lZ, VarList=lV, ResMM=fit.MMEst.hom) u.hat <- setNames(tmp$geno[,1], colnames(Z)) sigma.u.hat <- sqrt(fit.MMEst.hom$NullModel$Sigma2[-length(fit.MMEst.hom$NullModel$Sigma2)]) (sigma.hat <- sqrt(fit.MMEst.hom$NullModel$Sigma2[length(fit.MMEst.hom$NullModel$Sigma2)])) scaled.residuals <- tmp$Error / sigma.hat y.hat <- X %*% beta.hat + Z %*% u.hat tmp <- cbind(dat, scaled.residuals, y.hat)
Assuming heteroscedasticity:
```r
lSigmaBl <- lapply(1:nlevels(dat$block), function(bl){
  ## diagonal indicator matrix of the observations belonging to block bl
  diag(as.numeric(as.integer(dat$block) == bl))
})
names(lSigmaBl) <- paste0("ErrorBl", 1:nlevels(dat$block))
lZ <- append(list("geno"=Z),
             lapply(lSigmaBl, function(x){
               diag(nrow(dat)) # identity incidence for each error term
             }))
lV <- append(list("geno"=diag(nlevels(dat$geno))), lSigmaBl)
fit.MMEst.het <- MMEst(Y=dat$y, Cofactor=X, VarList=lV, ZList=lZ,
                       Method="Reml")
fit.MMEst.het.ML <- MMEst(Y=dat$y, Cofactor=X, VarList=lV, ZList=lZ,
                          Method="ML")
```
```r
## AIC(fit.MMEst.het)
BIC_MMEst <- function(fit, N, n_tot){
  ## https://hal.archives-ouvertes.fr/hal-00991708
  ## BIC = -2 LL + pen
  ## LL: maximized log-likelihood (ML)
  ## pen: number of estimated parameters times the logarithm of the sample
  ## size (variance parameters and fixed effects are penalized separately)
  stopifnot(fit$Method == "ML")
  len_theta_R <- length(fit$Sigma2) - 1 # ignore the error term
  len_theta_F <- length(fit$Beta)
  - 2 * fit$LogLik + len_theta_R * log(N) + len_theta_F * log(n_tot)
}
BIC_MMEst(fit.MMEst.hom.ML$NullModel, N=ncol(Z), n_tot=nrow(dat))
BIC_MMEst(fit.MMEst.het.ML$NullModel, N=ncol(Z), n_tot=nrow(dat))
```
diagnose("MMEst", tmp)
```r
## summary(fit.MMEst.het)
cbind(beta, beta.hat)
print(c(sigma.u, sigma.u.hat))
```
## inla

The `inla` function from the `INLA` package performs inference via the integrated nested Laplace approximation (INLA), and can handle both random variables and heteroscedasticity.
```r
## one Gaussian likelihood per block, via a multi-column response matrix:
dat.inla <- list(block=dat$block, geno=dat$geno,
                 Y=matrix(NA, nrow=nrow(dat), ncol=nlevels(dat$block)))
for(b in 1:nlevels(dat$block)){
  block <- levels(dat$block)[b]
  idx <- which(dat$block == block)
  dat.inla$Y[idx, b] <- dat$y[idx]
}
fit.inla <- inla(formula=Y ~ 1 + block + f(geno, model="iid"),
                 data=dat.inla,
                 family=rep("gaussian", nlevels(dat$block)),
                 control.compute=list(waic=TRUE))
beta.hat <- fit.inla$summary.fixed[,"mean"]
u.hat <- setNames(fit.inla$summary.random$geno[,"mean"],
                  fit.inla$summary.random$geno[,"ID"])
## INLA reports precisions; convert them into standard deviations:
sigma.u.hat <- 1 / sqrt(fit.inla$summary.hyperpar[B+1, "mean"])
sigma.b.hat <- 1 / sqrt(fit.inla$summary.hyperpar[1:B, "mean"])
y.hat <- fit.inla$summary.linear.predictor[,"mean"]
residuals <- dat$y - y.hat
scaled.residuals <- residuals / rep(sigma.b.hat, each=G)
tmp <- cbind(dat, scaled.residuals, y.hat)
```
```r
c(fit.inla$waic$p.eff, fit.inla$waic$waic)
```
diagnose("inla", tmp)
```r
summary(fit.inla)
cbind(beta, beta.hat,
      do.call(rbind, lapply(fit.inla$marginals.fixed, function(x){
        matrix(inla.hpdmarginal(0.95, x), 1, 2,
               dimnames=list(NULL, c("2.5 %", "97.5 %")))
      })))
cbind(sigma.b, sigma.b.hat,
      do.call(rbind, lapply(fit.inla$marginals.hyperpar[1:B], function(x){
        setNames(rev(1 / sqrt(inla.hpdmarginal(0.95, x))),
                 c("2.5 %", "97.5 %"))
      })))
if(sigma.u > 0){
  print(setNames(c(sigma.u, sigma.u.hat,
                   rev(1 / sqrt(inla.hpdmarginal(
                     0.95, fit.inla$marginals.hyperpar[[B+1]])))),
                 c("sigma.u", "sigma.u.hat", "2.5 %", "97.5 %")))
  print(c(cor(u, u.hat, method="pearson"), cor(u, u.hat, method="spearman")))
  regplot(x=u, y=u.hat, pred.int=FALSE, asp=1, las=1,
          main="Diagnostics inla", xlab="u", ylab="u.hat",
          legend.x="bottomright")
  abline(a=0, b=1, lty=2); abline(v=0, h=0, lty=2)
  u.hat.sort.desc <- sort(u.hat, decreasing=TRUE)
  pred.best.genos <- names(u.hat.sort.desc)[1:ceiling(perc.threshold/100 * G)]
  message(paste0("genotypes predicted to be among the ", perc.threshold,
                 "% best: ", sum(pred.best.genos %in% true.best.genos),
                 "/", length(true.best.genos)))
}
```
## stan

The `stan` function from the `rstan` package performs inference via HMC (Hamiltonian Monte Carlo), and can handle both random variables and heteroscedasticity.
stan.model.spec <- " data { int<lower=0> N; int<lower=0> G; int<lower=0> B; vector[N] y; matrix[N, B] X; matrix[N, G] Z; real loc_mu; real<lower=0> scale_mu; real<lower=0> nu; int idx_block[N]; } parameters { vector[B] beta; vector[G] u; vector[B] sigma_e; real<lower=0> sigma_u; } model { int idx_lo; int idx_up; vector[G] mean_u; mean_u = rep_vector(0, G); beta[1] ~ cauchy(loc_mu, scale_mu); for(b in 2:B) beta[b] ~ cauchy(0, 5); sigma_u ~ cauchy(0, 5); u ~ normal(mean_u, sigma_u); for(b in 1:B){ sigma_e[b] ~ cauchy(0, 5); idx_lo = 1 + (b-1) * G; idx_up = (b-1) * G + G; if(is_inf(nu)){ y[idx_lo:idx_up] ~ normal(X[idx_lo:idx_up,] * beta + Z[idx_lo:idx_up,] * u, sigma_e[b]); } else y[idx_lo:idx_up] ~ student_t(nu, X[idx_lo:idx_up,] * beta + Z[idx_lo:idx_up,] * u, sigma_e[b]); } } generated quantities { vector[N] fitted; vector[N] log_lik; fitted = X * beta + Z * u; for(n in 1:N){ if(is_inf(nu)){ log_lik[n] = normal_lpdf(y[n] | X[n,] * beta + Z[n,] * u, sigma_e[idx_block[n]]); } else log_lik[n] = student_t_lpdf(y[n] | nu, X[n,] * beta + Z[n,] * u, sigma_e[idx_block[n]]); } } " dat.stan <- list(N=G*B, G=G, B=B, y=dat$y[,1], X=X, Z=Z, loc_mu=0, scale_mu=mean(dat$y), nu=5, idx_block=as.numeric(dat$block)) nb.chains <- 2 burnin <- 1 * 10^3 thin <- 10 nb.usable.iters <- 2 * 10^3 nb.iters <- ceiling(burnin + (nb.usable.iters * thin) / nb.chains) fit.stan <- stan(data=dat.stan, model_code=stan.model.spec, chains=nb.chains, iter=nb.iters, warmup=burnin, thin=thin, verbose=FALSE, refresh=0) beta.hat <- colMeans(extract(fit.stan, pars="beta")$beta) u.hat <- colMeans(extract(fit.stan, pars="u")$u) sigma.u.hat <- mean(extract(fit.stan, pars="sigma_u")$sigma_u) sigma.b.hat <- colMeans(extract(fit.stan, pars="sigma_e")$sigma_e) y.hat <- colMeans(extract(fit.stan, "fitted")$fitted) residuals <- dat$y - y.hat scaled.residuals <- residuals / rep(sigma.b.hat, each=G) tmp <- cbind(dat, scaled.residuals, y.hat)
The `shinystan` package can help a lot when inspecting the output `fit.stan`, to check that the chains converged.
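For instance, its main function launches an interactive web interface on the fitted object (a minimal sketch; the `shinystan` package is not loaded above):

```r
library(shinystan)
launch_shinystan(fit.stan)
```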
To use a robust noise model, we can simply replace `normal` by `student_t`, with the number of degrees of freedom being passed as data; this is exactly what the `is_inf(nu)` branches above do (see the sketch below). To jointly estimate parameters and predict (impute) missing responses, we can simply add a second response vector: `y_miss ~ normal(X_miss * beta + Z_miss * u, sigma_e[b])`.
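Since $\nu$ enters the model as data, switching noise models only requires changing `nu` in the data list passed to `stan`; a minimal sketch (the value 30 is an arbitrary choice):

```r
## refit the same model with a different number of degrees of freedom
dat.stan.alt <- dat.stan
dat.stan.alt$nu <- 30 # closer to a Normal than the nu=5 used above
fit.stan.alt <- stan(data=dat.stan.alt, model_code=stan.model.spec,
                     chains=nb.chains, iter=nb.iters, warmup=burnin,
                     thin=thin, verbose=FALSE, refresh=0)
```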
```r
loo(extract_log_lik(fit.stan))
waic(extract_log_lik(fit.stan))
```
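The INLA fit above also reported a WAIC; as a minimal sanity check, the two values can be put side by side (with `loo` >= 2.0.0; keep in mind that the INLA fit uses Gaussian likelihoods whereas the Stan fit uses a Student's $t$ with `nu=5`, so some difference is expected):

```r
c(inla=fit.inla$waic$waic,
  stan=waic(extract_log_lik(fit.stan))$estimates["waic", "Estimate"])
```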
First, check convergence (the true values are indicated in red on the plots):
```r
summary(summary(fit.stan)$summary[, c("n_eff","Rhat")])
res.mcmc <- As.mcmc.list(object=fit.stan,
                         pars=names(fit.stan)[c(grep("beta", names(fit.stan)),
                                                grep("sigma", names(fit.stan)))])
plotMcmcChain(res.mcmc[[1]], "sigma_u", pe=sigma.u)
for(b in 1:B)
  plotMcmcChain(res.mcmc[[1]], paste0("sigma_e[",b,"]"), pe=sigma.b[b])
```
diagnose("stan", tmp)
```r
cbind(beta, beta.hat)
cbind(sigma.b, sigma.b.hat)
c(sigma.u, sigma.u.hat)
c(cor(u, u.hat, method="pearson"), cor(u, u.hat, method="spearman"))
print(fit.stan,
      pars=names(fit.stan)[c(grep("beta", names(fit.stan)),
                             grep("sigma", names(fit.stan)))],
      probs=c(0.025, 0.5, 0.975))
plot(fit.stan, pars=names(fit.stan)[grep("sigma", names(fit.stan))])
pairs(fit.stan,
      pars=names(fit.stan)[c(grep("beta", names(fit.stan)),
                             grep("sigma", names(fit.stan)))])
regplot(x=u, y=u.hat, pred.int=FALSE, asp=1, las=1,
        main="Diagnostics stan", xlab="u", ylab="u.hat",
        legend.x="bottomright")
abline(a=0, b=1, lty=2); abline(v=0, h=0, lty=2)
```
TODO
```r
t1 <- proc.time()
t1 - t0
print(sessionInfo(), locale=FALSE)
```