library(knitr)
knitr::opts_chunk$set(fig.width=6, fig.height=5, fig.align="center",
                      global.par=TRUE)
par(mar=c(5, 4, 2, 0.5))

Preamble

License: CC BY-SA 4.0

References:

External packages:

library(rutilstimflutre)
library(lattice)
library(cvTools)
library(caret)
library(nlme)
library(lme4)
library(MM4LMM)
library(INLA)
library(rstan)
library(loo)

This R chunk is used to assess how much time it takes to execute all the R code in this document (see the Appendix at the end):

t0 <- proc.time()

Heteroscedasticity in the errors

Model

Goal: study the genotype-phenotype map

Notations:

- $G$: number of genotypes, indexed by $g$; $B$: number of blocks, indexed by $b$
- $y_{gb}$: phenotype of genotype $g$ in block $b$
- $\mu$: global mean; $\beta_b$: fixed effect of block $b$
- $u_g$: random effect of genotype $g$; $\epsilon_{gb}$: error term

Likelihood:

\[ y_{gb} = \mu + \beta_b + u_g + \epsilon_{gb} \text{ where } \epsilon_{gb} \sim \mathcal{N}(0, \sigma_b^2) \text{ and } u_g \sim \mathcal{N}(0, \sigma_u^2) \]

This model is heteroscedastic as the errors have different variances depending on the block, that is, $\sigma_{b=1}$ may be different from $\sigma_{b=2}$, etc.
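Marginalizing over the random effects, the independence of $u_g$ and $\epsilon_{gb}$ gives:

\[ \mathbb{V}[y_{gb}] = \sigma_u^2 + \sigma_b^2 \quad \text{and} \quad \text{Cov}[y_{gb}, y_{gb'}] = \sigma_u^2 \text{ for } b \neq b' \]

so that both the phenotypic variance and the share of it due to genotypes differ between blocks.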

With real data, it is not that rare to have large errors ("outliers"), which calls for a more robust noise model, e.g., the Student's $t$ distribution, $\epsilon_{gb} \sim t(\nu, 0, \sigma_b)$, which reduces to $\mathcal{N}(0, \sigma_b^2)$ as the number of degrees of freedom $\nu \rightarrow +\infty$.

Moreover, it is also frequent to have missing data, which unbalances the design (a sketch of how to mimic this is given at the end of the Simulation section below).

Simulation

set.seed(1859)
B <- 3
G <- 2.5*10^2
head(geno.ids <- sprintf(fmt=paste0("g%0", floor(log10(G)) + 1, "i"), 1:G))
dat <- data.frame(block=rep(LETTERS[1:B], each=G),
                  geno=rep(geno.ids, B),
                  stringsAsFactors=TRUE)
X <- model.matrix(~ 1 + block, data=dat)
mu <- 50
(beta <- c(mu, rnorm(n=B-1, mean=0, sd=3)))
Z <- model.matrix(~ -1 + geno, data=dat)
sigma.u <- 4
u <- setNames(rnorm(n=G, mean=0, sd=sigma.u), geno.ids)
(sigma.b <- setNames(sample(1:8, size=B, replace=ifelse(8 < B, TRUE, FALSE)),
                     levels(dat$block)))
nu <- + Inf
epsilon <- do.call(c, lapply(sigma.b, function(sd.b){
  if(is.infinite(nu)){
    rnorm(n=G, mean=0, sd=sd.b)
  } else # scaled t draw (rt has no scale argument, and ncp would shift the location)
    sd.b * rt(n=G, df=nu)
}))
dat$y <- X %*% beta + Z %*% u + epsilon
str(dat)

Don't hesitate to play with the simulation by changing some values, e.g., increasing or decreasing $B$ and/or $G$, setting $\sigma_u$ to 0 or not, setting $\nu$ to $+\infty$ or not, etc.
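As mentioned above, missing phenotypes unbalance the design. A minimal sketch to mimic this on the simulated data (the proportion prop.miss and the object name dat.miss are arbitrary choices):

prop.miss <- 0.1
idx.miss <- sample.int(n=nrow(dat), size=floor(prop.miss * nrow(dat)))
dat.miss <- dat
dat.miss$y[idx.miss] <- NA
table(dat.miss$block[idx.miss])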

Data exploration

Descriptive stats:

(tmp <- do.call(rbind, tapply(dat$y, dat$block, function(y.b){
  c(mean=mean(y.b), sd=sd(y.b))
})))

Visualization:

boxplot(y ~ block, data=dat, las=1, varwidth=TRUE,
        xlab="blocks", ylab="phenotypes",
        main=paste0("Simulated data (nu=", nu,
                    ", sigma_u=", format(sigma.u, digits=2), ")"))
abline(h=mean(dat$y), lty=2)

Model fit and diagnostics

Useful function

For diagnostics:

diagnose <- function(fit.name, tmp){
  message("variance of all residuals:")
  print(var(tmp$scaled.residuals))
  message("variance of the residuals per block:")
  print(tapply(tmp$scaled.residuals, tmp$block, function(res.b){
    var(res.b)
  }))
  message("outliers among all residuals:")
  print(c("<-2"=sum(tmp$scaled.residuals < - 2), ">+2"=sum(tmp$scaled.residuals > 2)))
  message("outliers among residuals per block:")
  print(do.call(rbind, tapply(tmp$scaled.residuals, tmp$block, function(res.b){
    c("<-2"=sum(res.b < -2), ">+2"=sum(res.b > 2))
  })))
  plot(x=tmp$scaled.residuals,
       y=tmp$y.hat, las=1,
       main=paste0("Diagnostics ", fit.name),
       xlab="scaled residuals",
       ylab="y.hat")
  abline(v=c(-2,0,2), lty=2)
  boxplot(scaled.residuals ~ block, data=tmp, las=1, varwidth=TRUE,
          xlab="scaled residuals", ylab="blocks", horizontal=TRUE,
          main=paste0("Diagnostics ", fit.name))
  abline(v=c(-2,0,2), lty=2)
  print(dotplot(geno ~ scaled.residuals, data=tmp,
                main=paste0("Diagnostics ", fit.name),
                panel=function(x,y,...){
                  panel.dotplot(x,y,...)
                  panel.abline(v=c(-2,0,2), lty=2)
                })) # lattice plots must be print()ed inside a function
  regplot(x=tmp$y, y=tmp$y.hat,
          reg=c("lm","rlm"), col=c(lm="red",rlm="blue"), pred.int=FALSE,
          asp=1, las=1,
          main=paste0("Diagnostics ", fit.name),
          xlab="y", ylab="y.hat", legend.x="bottomright")
  abline(a=0, b=1, lty=2)
  abline(v=mean(tmp$y), h=mean(tmp$y.hat), lty=2)
}

For model selection:

cvf <- cvTools::cvFolds(n=nrow(dat), K=5, type="interleaved")
train.ctl <- caret::trainControl(method="repeatedcv", number=5,
                                 repeats=3)

For inference:

u.sort.desc <- sort(u, decreasing=TRUE)
perc.threshold <- 10
true.best.genos <- names(u.sort.desc)[1:ceiling(perc.threshold/100 * G)]
summary(u[true.best.genos])

gls

Model fit

The gls function from the nlme package performs inference via generalized least squares; it can model heteroscedastic errors but cannot handle random effects.

Assuming homoscedasticity:

fit.gls.hom <- gls(model=y ~ 1 + block, data=dat)
beta.hat.hom <- coef(fit.gls.hom)
sigma.b.hat.hom <- fit.gls.hom$sigma
scaled.residuals.hom <- residuals(fit.gls.hom) / sigma.b.hat.hom
y.hat.hom <- fitted(fit.gls.hom)
tmp.hom <- cbind(dat,
                 "scaled.residuals"=scaled.residuals.hom,
                 "y.hat"=y.hat.hom)

Assuming heteroscedasticity:

fit.gls.het <- gls(model=y ~ 1 + block, data=dat,
                   weights=varIdent(form=~1|block))
beta.hat.het <- coef(fit.gls.het)
sigma.b.hat.het <- c(1, exp(coef(fit.gls.het$modelStruct$varStruct))) *
  fit.gls.het$sigma
scaled.residuals.het <- residuals(fit.gls.het) / rep(sigma.b.hat.het, each=G)
y.hat.het <- fitted(fit.gls.het)
tmp.het <- cbind(dat,
                 "scaled.residuals"=scaled.residuals.het,
                 "y.hat"=y.hat.het)

Model comparison and selection

Clearly, the model assuming an error variance per block, i.e., heteroscedasticity, fits better than the model assuming homoscedasticity:

AIC(fit.gls.hom)
AIC(fit.gls.het)
AIC(fit.gls.hom, fit.gls.het)
BIC(fit.gls.hom)
BIC(fit.gls.het)
BIC(fit.gls.hom, fit.gls.het)
anova(fit.gls.hom, fit.gls.het)
fit.gls.het.cv <- cvFit(gls, formula=y ~ 1 + block, data=dat, y=dat$y,
                        cost=rmspe, folds=cvf,
                        weights=varIdent(form=~1|block)) # passed on to gls()
fit.gls.het.cv
glsMeth <- list(label="Generalized least squares",
                library="nlme",
                type="Regression",
                ## caret requires a one-row parameter table even for
                ## methods without any tuning parameter:
                parameters=data.frame(parameter="parameter",
                                      class="character",
                                      label="parameter"),
                grid=function(x, y, len=NULL, search="grid"){
                  data.frame(parameter="none")
                },
                ## caret calls fit() with the signature below; var.func
                ## is forwarded from the train() call via "...":
                fit=function(x, y, wts, param, lev, last, weights,
                             classProbs, var.func=NULL, ...){
                  dat.tr <- as.data.frame(x)
                  dat.tr$.outcome <- y
                  gls(model=.outcome ~ ., data=dat.tr, weights=var.func)
                },
                predict=function(modelFit, newdata, submodels=NULL){
                  predict(modelFit, newdata=as.data.frame(newdata))
                },
                prob=NULL,
                sort=NULL)
fit.gls.het.cv <- train(x=dat[, "block", drop=FALSE], y=dat$y[,1],
                        var.func=varIdent(form=~1|block),
                        method=glsMeth,
                        trControl=train.ctl)
fit.gls.het.cv

Diagnostics

diagnose("gls (hom)", tmp.hom)
diagnose("gls (het)", tmp.het)

Inference

summary(fit.gls.hom)
cbind(beta, beta.hat.hom, confint(fit.gls.hom))
cbind(sigma.b, sigma.b.hat.hom)
(tav <- anova(fit.gls.hom))
summary(fit.gls.het)
cbind(beta, beta.hat.het, confint(fit.gls.het))
cbind(sigma.b, sigma.b.hat.het)
(tav <- anova(fit.gls.het))

Unfortunately, anova applied to a gls object does not return the sums of squares nor the mean squares. In the code of nlme:::anova.gls, one can see how the F statistics are computed (Fval <- sum(c0^2)/nDF).
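For comparison, under the homoscedasticity assumption, the classical ANOVA table with sums of squares and mean squares can be obtained from an ordinary least squares fit (y is stored as a one-column matrix, hence y[,1]):

anova(lm(y[,1] ~ 1 + block, data=dat))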

lmer

Model fit

The lmer function from the lme4 package performs inference via ReML and handles random effects, but cannot handle heteroscedastic errors.

fit.lmer.hom <- lmer(formula=y ~ 1 + block + (1|geno), data=dat)
beta.hat <- fixef(fit.lmer.hom)
u.hat <- setNames(ranef(fit.lmer.hom)$geno[,"(Intercept)"],
                  rownames(ranef(fit.lmer.hom)$geno))[names(u)]
sigma.u.hat <- sqrt(VarCorr(fit.lmer.hom)$geno[1])
sigma(fit.lmer.hom)
scaled.residuals <- residuals(fit.lmer.hom) / sigma(fit.lmer.hom)
y.hat <- fitted(fit.lmer.hom)
tmp <- cbind(dat, scaled.residuals, y.hat)

Model comparison and selection

extractAIC(fit.lmer.hom)
BIC(fit.lmer.hom)

Diagnostics

diagnose("lmer (hom)", tmp)

Inference

summary(fit.lmer.hom)
cbind(beta, beta.hat, confint(fit.lmer.hom, parm="beta_", quiet=TRUE))
if(sigma.u > 0){
  print(c(sigma.u, sigma.u.hat))
  regplot(x=u, y=u.hat, pred.int=FALSE, asp=1, las=1,
          main="Diagnostics lmer", xlab="u", ylab="u.hat", legend.x="bottomright")
  abline(a=0, b=1, lty=2); abline(v=0, h=0, lty=2)
  u.hat.sort.desc <- sort(u.hat, decreasing=TRUE)
  pred.best.genos <- names(u.hat.sort.desc)[1:ceiling(perc.threshold/100 * G)]
  message(paste0("genotypes predicted to be among the ",
                 perc.threshold, "% best: ",
                 sum(pred.best.genos %in% true.best.genos), "/",
                 length(true.best.genos)))
}

lme

Model fit

The lme function from the nlme package performs inference via ReML, and can handle both random effects and heteroscedastic errors.

fit.lme.het <- lme(fixed=y ~ 1 + block, random=~1|geno,
                   weights=varIdent(form=~1|block),
                   data=dat)
beta.hat <- fixef(fit.lme.het)
u.hat <- setNames(ranef(fit.lme.het)[,"(Intercept)"],
                  rownames(ranef(fit.lme.het)))[names(u)]
sigma.u.hat <- as.numeric(VarCorr(fit.lme.het)[1, "StdDev"])
sigma.b.hat <- c(1, exp(coef(fit.lme.het$modelStruct$varStruct))) *
  fit.lme.het$sigma
scaled.residuals <- residuals(fit.lme.het) / rep(sigma.b.hat, each=G)
y.hat <- fitted(fit.lme.het)
tmp <- cbind(dat, scaled.residuals, y.hat)

Model comparison and selection

AIC(fit.lme.het)
BIC(fit.lme.het)
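Since fit.gls.het and fit.lme.het both come from nlme, are both fitted by ReML and share the same fixed effects, anova can also be used to assess the benefit of the genotype random effect:

anova(fit.gls.het, fit.lme.het)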

Diagnostics

diagnose("lme (het)", tmp)

Inference

summary(fit.lme.het)
cbind(beta, beta.hat,
      intervals(fit.lme.het, which="fixed")$fixed[,c("lower","upper")])
cbind(sigma.b, sigma.b.hat)
if(sigma.u > 0){
  print(c(sigma.u, sigma.u.hat))
  print(c(cor(u, u.hat, method="pearson"), cor(u, u.hat, method="spearman")))
  regplot(x=u, y=u.hat, pred.int=FALSE, asp=1, las=1,
          main="Diagnostics lme", xlab="u", ylab="u.hat", legend.x="bottomright")
  abline(a=0, b=1, lty=2); abline(v=0, h=0, lty=2)
  u.hat.sort.desc <- sort(u.hat, decreasing=TRUE)
  pred.best.genos <- names(u.hat.sort.desc)[1:ceiling(perc.threshold/100 * G)]
  message(paste0("genotypes predicted to be among the ",
                 perc.threshold, "% best: ",
                 sum(pred.best.genos %in% true.best.genos), "/",
                 length(true.best.genos)))
}

MMEst

Model fit

The MMEst function from the MM4LMM package performs inference via ReML, and can handle both random effects and heteroscedastic errors.

Assuming homoscedasticity:

lZ <- list("geno"=Z, "Error"=diag(nrow(dat)))
lV <- list("geno"=diag(nlevels(dat$geno)),
           "Error"=diag(nrow(dat)))
fit.MMEst.hom <- MMEst(Y=dat$y, Cofactor=X, VarList=lV, ZList=lZ, Method="Reml")
fit.MMEst.hom.ML <- MMEst(Y=dat$y, Cofactor=X, VarList=lV, ZList=lZ, Method="ML")
beta.hat <- fit.MMEst.hom$NullModel$Beta
blups <- MMBlup(Y=dat$y, Cofactor=X, ZList=lZ, VarList=lV, ResMM=fit.MMEst.hom)
u.hat <- setNames(blups$geno[,1], colnames(Z))
sigma.u.hat <- sqrt(fit.MMEst.hom$NullModel$Sigma2[-length(fit.MMEst.hom$NullModel$Sigma2)])
(sigma.hat <- sqrt(fit.MMEst.hom$NullModel$Sigma2[length(fit.MMEst.hom$NullModel$Sigma2)]))
scaled.residuals <- blups$Error / sigma.hat
y.hat <- X %*% beta.hat + Z %*% u.hat
tmp <- cbind(dat, scaled.residuals, y.hat)

Assuming heteroscedasticity:

lSigmaBl <- lapply(levels(dat$block), function(bl){
  diag(as.numeric(dat$block == bl)) # block-bl indicator on the diagonal
})
names(lSigmaBl) <- paste0("ErrorBl", 1:nlevels(dat$block))
lZ <- append(list("geno"=Z),
             lapply(lSigmaBl, function(x){diag(nrow(dat))})) # identity incidence matrices
lV <- append(list("geno"=diag(nlevels(dat$geno))),
             lSigmaBl)
fit.MMEst.het <- MMEst(Y=dat$y, Cofactor=X, VarList=lV, ZList=lZ, Method="Reml")
fit.MMEst.het.ML <- MMEst(Y=dat$y, Cofactor=X, VarList=lV, ZList=lZ, Method="ML")

Model comparison and selection

## AIC(fit.MMEst.het)
BIC_MMEst <- function(fit, N, n_tot){
  ## https://hal.archives-ouvertes.fr/hal-00991708
  ## BIC = - 2 LL + pen, where LL is the maximized log-likelihood (ML) and
  ## pen penalizes each variance component by log(N), the number of random
  ## levels, and each fixed effect by log(n_tot), the total sample size
  stopifnot(fit$Method == "ML")
  len_theta_R <- length(fit$Sigma2) - 1 # ignore the error term
  len_theta_F <- length(fit$Beta)
  - 2 * fit$LogLik + len_theta_R * log(N) + len_theta_F * log(n_tot)
}
BIC_MMEst(fit.MMEst.hom.ML$NullModel, N=ncol(Z), n_tot=nrow(dat))
BIC_MMEst(fit.MMEst.het.ML$NullModel, N=ncol(Z), n_tot=nrow(dat))
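Along the same lines, a minimal AIC counterpart can be sketched (an assumption here, not part of MM4LMM: each variance component and each fixed effect counts as one estimated parameter):

AIC_MMEst <- function(fit){
  ## AIC = - 2 LL + 2 k, with k the number of estimated parameters
  stopifnot(fit$Method == "ML")
  k <- length(fit$Sigma2) + length(fit$Beta)
  - 2 * fit$LogLik + 2 * k
}
AIC_MMEst(fit.MMEst.hom.ML$NullModel)
AIC_MMEst(fit.MMEst.het.ML$NullModel)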

Diagnostics

diagnose("MMEst", tmp)

Inference

## summary(fit.MMEst.het)
cbind(beta, beta.hat)
print(c(sigma.u, sigma.u.hat))

inla

Model fit

The inla function from the INLA package performs inference via the "integrated nested Laplace approximation", and can handle both random effects and heteroscedastic errors.

dat.inla <- list(block=dat$block,
                 geno=dat$geno,
                 Y=matrix(NA, nrow=nrow(dat), ncol=nlevels(dat$block)))
for(b in 1:nlevels(dat$block)){
  block <- levels(dat$block)[b]
  idx <- which(dat$block == block)
  dat.inla$Y[idx, b] <- dat$y[idx]
}
fit.inla <- inla(formula=Y ~ 1 + block + f(geno, model="iid"),
                 data=dat.inla,
                 family=rep("gaussian", nlevels(dat$block)),
                 control.compute=list(waic=TRUE))
beta.hat <- fit.inla$summary.fixed[,"mean"]
u.hat <- setNames(fit.inla$summary.random$geno[,"mean"],
                  fit.inla$summary.random$geno[,"ID"])
sigma.u.hat <- 1 / sqrt(fit.inla$summary.hyperpar[B+1, "mean"])
sigma.b.hat <- 1 / sqrt(fit.inla$summary.hyperpar[1:B, "mean"])
y.hat <- fit.inla$summary.linear.predictor[,"mean"]
residuals <- dat$y - y.hat
scaled.residuals <- residuals / rep(sigma.b.hat, each=G)
tmp <- cbind(dat, scaled.residuals, y.hat)

Model comparison and selection

c(fit.inla$waic$p.eff, fit.inla$waic$waic)

Diagnostics

diagnose("inla", tmp)

Inference

summary(fit.inla)
cbind(beta, beta.hat,
      do.call(rbind, lapply(fit.inla$marginals.fixed, function(x){
        matrix(inla.hpdmarginal(0.95, x), 1, 2,
               dimnames=list(NULL, c("2.5 %"," 97.5 %")))
      })))
cbind(sigma.b, sigma.b.hat,
      do.call(rbind, lapply(fit.inla$marginals.hyperpar[1:B], function(x){
        setNames(rev(1 / sqrt(inla.hpdmarginal(0.95, x))),
                 c("2.5 %"," 97.5 %"))
      })))
if(sigma.u > 0){
  print(setNames(c(sigma.u, sigma.u.hat,
                   rev(1 / sqrt(inla.hpdmarginal(0.95,
                                                 fit.inla$marginals.hyperpar[[B+1]])))),
                 c("sigma.u", "sigma.u.hat", "2.5 %", " 97.5 %")))
  print(c(cor(u, u.hat, method="pearson"), cor(u, u.hat, method="spearman")))
  regplot(x=u, y=u.hat, pred.int=FALSE, asp=1, las=1,
          main="Diagnostics inla", xlab="u", ylab="u.hat", legend.x="bottomright")
  abline(a=0, b=1, lty=2); abline(v=0, h=0, lty=2)
  u.hat.sort.desc <- sort(u.hat, decreasing=TRUE)
  pred.best.genos <- names(u.hat.sort.desc)[1:ceiling(perc.threshold/100 * G)]
  message(paste0("genotypes predicted to be among the ",
                 perc.threshold, "% best: ",
                 sum(pred.best.genos %in% true.best.genos), "/",
                 length(true.best.genos)))
}

stan

Model fit

The stan function from the rstan package performs inference via Hamiltonian Monte Carlo (HMC), and can handle both random effects and heteroscedastic errors.

stan.model.spec <- "
data {
  int<lower=0> N;
  int<lower=0> G;
  int<lower=0> B;
  vector[N] y;
  matrix[N, B] X;
  matrix[N, G] Z;
  real loc_mu;
  real<lower=0> scale_mu;
  real<lower=0> nu;
  int idx_block[N];
}
parameters {
  vector[B] beta;
  vector[G] u;
  vector<lower=0>[B] sigma_e; // the half-Cauchy prior requires the positivity constraint
  real<lower=0> sigma_u;
}
model {
  int idx_lo;
  int idx_up;
  vector[G] mean_u;
  mean_u = rep_vector(0, G);

  beta[1] ~ cauchy(loc_mu, scale_mu);
  for(b in 2:B)
    beta[b] ~ cauchy(0, 5);
  sigma_u ~ cauchy(0, 5);
  u ~ normal(mean_u, sigma_u);
  for(b in 1:B){
    sigma_e[b] ~ cauchy(0, 5);
    idx_lo = 1 + (b-1) * G;
    idx_up = (b-1) * G + G;
    if(is_inf(nu)){
      y[idx_lo:idx_up] ~ normal(X[idx_lo:idx_up,] * beta + Z[idx_lo:idx_up,] * u,
                                sigma_e[b]);
    } else
      y[idx_lo:idx_up] ~ student_t(nu, X[idx_lo:idx_up,] * beta + Z[idx_lo:idx_up,] * u,
                                   sigma_e[b]);
  }
}
generated quantities {
  vector[N] fitted;
  vector[N] log_lik;
  fitted = X * beta + Z * u;
  for(n in 1:N){
    if(is_inf(nu)){
      log_lik[n] = normal_lpdf(y[n] | X[n,] * beta + Z[n,] * u,
                                      sigma_e[idx_block[n]]);
    } else
      log_lik[n] = student_t_lpdf(y[n] | nu, X[n,] * beta + Z[n,] * u,
                                         sigma_e[idx_block[n]]);
  }
}
"

dat.stan <- list(N=G*B, G=G, B=B, y=dat$y[,1], X=X, Z=Z,
                 loc_mu=0, scale_mu=mean(dat$y), nu=5,
                 idx_block=as.numeric(dat$block))
nb.chains <- 2
burnin <- 1 * 10^3
thin <- 10
nb.usable.iters <- 2 * 10^3
nb.iters <- ceiling(burnin + (nb.usable.iters * thin) / nb.chains)
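## with the values above: 1000 + (2000 * 10) / 2 = 11000 iterations per
## chain, of which (11000 - 1000) / 10 = 1000 are kept after thinning,
## hence 2 x 1000 = 2000 usable samples in total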
fit.stan <- stan(data=dat.stan, model_code=stan.model.spec,
                 chains=nb.chains, iter=nb.iters, warmup=burnin, thin=thin,
                 verbose=FALSE, refresh=0)
beta.hat <- colMeans(extract(fit.stan, pars="beta")$beta)
u.hat <- colMeans(extract(fit.stan, pars="u")$u)
sigma.u.hat <- mean(extract(fit.stan, pars="sigma_u")$sigma_u)
sigma.b.hat <- colMeans(extract(fit.stan, pars="sigma_e")$sigma_e)
y.hat <- colMeans(extract(fit.stan, "fitted")$fitted)
residuals <- dat$y - y.hat
scaled.residuals <- residuals / rep(sigma.b.hat, each=G)
tmp <- cbind(dat, scaled.residuals, y.hat)

The shinystan package can help a lot when inspecting the output fit.stan, to check that the chains converged.
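For instance (interactive use only, hence not run when compiling this document):

if(interactive()){
  library(shinystan)
  launch_shinystan(fit.stan)
}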

To use a robust noise model, we can simply replace normal by student_t, with the number of degrees of freedom being passed as data; the model above already does this via the is_inf(nu) branches.

To jointly estimate parameters and predict (impute) missing responses, we can simply add a second response vector: y_miss ~ normal(X_miss * beta + Z_miss * u, sigma_e[b]).
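As a hypothetical sketch of the corresponding data preparation (the names N_miss, X_miss and Z_miss echo the snippet above and are not part of the model as written):

idx.miss <- which(is.na(dat$y[,1]))
if(length(idx.miss) > 0){
  dat.stan$N_miss <- length(idx.miss)
  dat.stan$X_miss <- X[idx.miss, , drop=FALSE]
  dat.stan$Z_miss <- Z[idx.miss, , drop=FALSE]
}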

Model comparison and selection

loo(extract_log_lik(fit.stan))
waic(extract_log_lik(fit.stan))

Diagnostics

First check that the chains converged (the true value of each parameter is indicated in red on the plots):

summary(summary(fit.stan)$summary[, c("n_eff","Rhat")])
res.mcmc <- As.mcmc.list(object=fit.stan,
                         pars=names(fit.stan)[c(grep("beta", names(fit.stan)),
                                                grep("sigma", names(fit.stan)))])
plotMcmcChain(res.mcmc[[1]], "sigma_u", pe=sigma.u)
for(b in 1:B)
  plotMcmcChain(res.mcmc[[1]], paste0("sigma_e[",b,"]"), pe=sigma.b[b])
diagnose("stan", tmp)

Inference

cbind(beta, beta.hat)
cbind(sigma.b, sigma.b.hat)
c(sigma.u, sigma.u.hat)
c(cor(u, u.hat, method="pearson"), cor(u, u.hat, method="spearman"))
print(fit.stan,
      pars=names(fit.stan)[c(grep("beta", names(fit.stan)),
                             grep("sigma", names(fit.stan)))],
      probs=c(0.025, 0.5, 0.975))
plot(fit.stan,
     pars=names(fit.stan)[grep("sigma", names(fit.stan))])
pairs(fit.stan,
      pars=names(fit.stan)[c(grep("beta", names(fit.stan)),
                             grep("sigma", names(fit.stan)))])
regplot(x=u, y=u.hat, pred.int=FALSE, asp=1, las=1,
        main="Diagnostics stan", xlab="u", ylab="u.hat", legend.x="bottomright")
abline(a=0, b=1, lty=2); abline(v=0, h=0, lty=2)

Heteroscedasticity in a multi-level predictor

Model

TODO

Simulation

TODO

Data exploration

TODO

Model fit and diagnostics

TODO

Appendix

t1 <- proc.time()
t1 - t0
print(sessionInfo(), locale=FALSE)

