suppressPackageStartupMessages(library(knitr))
if(! "params" %in% ls())
  params <- list(root.dir=NULL)
opts_chunk$set(echo=TRUE, warning=TRUE, message=TRUE, cache=FALSE,
               fig.align="center", root.dir=params$root.dir)
opts_knit$set(progress=TRUE, verbose=TRUE)

Overview

This document aims at exploring two-phase analysis in quantitative genetics as seen with perennial plant species, e.g., grapevine, depending on the amount of missing data, that is when the data set is more or less unbalanced, and the discrepancy between the generative and statistical models.

This document requires external packages to be available:

suppressPackageStartupMessages(library(parallel))
nb.cores <- ifelse(detectCores() > 20, 20, detectCores() - 1)
suppressPackageStartupMessages(library(scrm))
suppressPackageStartupMessages(library(sp))
suppressPackageStartupMessages(library(randtoolbox))
## suppressPackageStartupMessages(library(DiceDesign))
suppressPackageStartupMessages(library(gstat))
suppressPackageStartupMessages(library(beanplot))
suppressPackageStartupMessages(library(lme4))
suppressPackageStartupMessages(library(MuMIn))
suppressPackageStartupMessages(library(boot))
suppressPackageStartupMessages(library(lattice))
suppressPackageStartupMessages(library(lmerTest))
stopifnot(compareVersion("3.0",
                         as.character(packageVersion("lmerTest")))
          != 1)
suppressPackageStartupMessages(library(emmeans))
suppressPackageStartupMessages(library(MM4LMM))
suppressPackageStartupMessages(library(sommer))
## suppressPackageStartupMessages(library(mlmm.gwas))
## suppressPackageStartupMessages(library(varbvs))
## stopifnot(compareVersion("2.5.7",
##                          as.character(packageVersion("varbvs")))
##           != 1)
suppressPackageStartupMessages(library(rutilstimflutre)) # on GitHub
stopifnot(compareVersion("0.170.1",
                         as.character(packageVersion("rutilstimflutre")))
          != 1)
## stopifnot(file.exists(Sys.which("gemma"))) # https://github.com/genetics-statistics/GEMMA/

This R chunk is used to assess how much time it takes to execute the R code in this document until the end:

t0 <- proc.time()

Generative model

Phenotypic level

Dimensions:

Variables:

Genotypic level

Strategy

No unified statistical model can be currently used to reliably perform inference of SNP effects in the presence of genotype-year interactions and spatial heterogeneity. As a result, a two-phase strategy should be used.

Phases:

  1. estimate/predict $\boldsymbol{g}$ as "average" $\boldsymbol{y}$ per genotype;

  2. regress $\hat{\boldsymbol{g}}$ on $M$ to infer $\boldsymbol{\alpha}$ and $\boldsymbol{\gamma_a}$ (and $\boldsymbol{\delta}$ and $\boldsymbol{\gamma_d}$ ).

Questions:

Simulation of genetic data

Initial population

Set the seed:

set.seed(1859)

Simulate haplotypes and genotypes in one population at the Hardy-Weinberg equilibrium:

nb.genos <- 1 * 10^3
nb.chrs <- 10
chr.len.phy <- 10^5     # chromosome physical length in base pairs
mu <- 5 * 10^(-8)       # neutral mutation rate in events / base / generation
u <- mu * chr.len.phy   #  in events / chrom / gen
chr.len.gen <- 10^(-1)  # chromosome genetic length in Morgans
c.r <- chr.len.gen / chr.len.phy # recomb rate in events / base / gen
r <- c.r * chr.len.phy  #  in events / chrom / gen
Ne <- 10^4              # effective population size
theta <- 4 * Ne * u     # scaled neutral mutation rate in events / chrom
rho <- 4 * Ne * r       # scaled recomb rate in events / chrom
p2f <- "two-phase_quantgen_genomes.RData"
if(! file.exists(p2f)){
  st <- system.time(
      genomes <- simulCoalescent(nb.inds=nb.genos,
                                 nb.reps=nb.chrs,
                                 pop.mut.rate=theta,
                                 pop.recomb.rate=rho,
                                 chrom.len=chr.len.phy,
                                 get.alleles=TRUE))
  print(st)
  save(genomes, file=p2f)
  print(tools::md5sum(path.expand(p2f)))
} else{
  print(tools::md5sum(path.expand(p2f)))
  load(p2f)
}
afs.pop <- estimSnpAf(X=genomes$genos)
mafs.pop <- estimSnpMaf(afs=afs.pop)
A.vr <- estimGenRel(X=genomes$genos, afs=afs.pop, method="vanraden1")
mrk2chr <- setNames(genomes$snp.coords$chr, rownames(genomes$snp.coords))

Look at some visual checks:

plotHistAllelFreq(afs=afs.pop, main="Allele frequencies")
plotHistMinAllelFreq(mafs=mafs.pop, main="Minor allele frequencies")
## imageWithScale(A.vr, main="Additive genetic relationships")
summary(diag(A.vr))
hist(diag(A.vr), col="grey", border="white")
summary(A.vr[upper.tri(A.vr)])
hist(A.vr[upper.tri(A.vr)], col="grey", border="white")

Bi-parental cross

Set the seed:

set.seed(1859)

Choose two individuals as parents:

(idx.parents <- sample(x=1:nb.genos, size=2, replace=FALSE))
A.vr[idx.parents, idx.parents]
genos.parents <- genomes$genos[idx.parents,]
names.parents <- rownames(genos.parents)
haplos.parents <- getHaplosInds(haplos=genomes$haplos,
                                ind.names=names.parents)

Cross them several times to make offsprings:

nb.offs <- 200
names.offs <- paste0("off-",
                     sprintf(fmt=paste0("%0", floor(log10(nb.offs))+1, "i"),
                             1:nb.offs))
head(crosses.off <- data.frame(parent1=rep(names.parents[1], nb.offs),
                               parent2=rep(names.parents[2], nb.offs),
                               child=names.offs,
                               stringsAsFactors=FALSE))
loc.crovers.off <- drawLocCrossovers(crosses=crosses.off,
                                     nb.snps=sapply(haplos.parents, ncol),
                                     simplistic=FALSE,
                                     verbose=1)
haplos.offs <- makeCrosses(haplos=haplos.parents, crosses=crosses.off,
                           loc.crossovers=loc.crovers.off,
                           howto.start.haplo=0)
genos.offs <- segSites2allDoses(seg.sites=haplos.offs,
                                ind.ids=getIndNamesFromHaplos(haplos.offs),
                                snp.ids=rownames(genomes$snp.coords))
dim(genos.doses <- rbind(genos.parents, genos.offs))
genos.classes <- genoDoses2genoClasses(X=genos.doses,
                                       alleles=genomes$alleles)
genos.jm <- genoClasses2JoinMap(x=genos.classes)
genos.jm[1:3, 1:14]
tests.seg <- filterSegreg(genos.jm[,-c(1:8)], return.counts=TRUE)

The JoinMap format will be used to encode genotypes from both parents and offsprings as it allows to specify segregation types and phases, and can be natively read by the R qtl package.

Plot pedigree:

ped <- data.frame(ind=c(names.parents,
                        crosses.off$child),
                  mother=c(rep(NA, length(names.parents)),
                           crosses.off$parent1),
                  father=c(rep(NA, length(names.parents)),
                           crosses.off$parent2),
                  gen=c(rep(0, length(names.parents)),
                        rep(1, nrow(crosses.off))),
                  stringsAsFactors=FALSE)
ped.tmp <- rbind(ped[1:5,],
                 c(ind="off-...", ped[5, -1]),
                 c(ind="off-....", ped[5, -1]),
                 ped[nrow(ped),])
plotPedigree(inds=ped.tmp$ind, mothers=ped.tmp$mother, fathers=ped.tmp$father,
             generations=ped.tmp$gen, main="Pedigree of the controlled cross")

Check additive genetic relationships:

A.vr.cross <- estimGenRel(X=genos.doses, afs=afs.pop, method="vanraden1")
A.t.cross <- estimGenRel(X=genos.doses, afs=afs.pop, method="toro2011_eq10")
cor(c(A.vr.cross), c(A.t.cross))
## imageWithScale(A.vr.cross, main="Additive genetic relationships of crosses")
## imageWithScale(A.vr.cross[1:10, 1:10],
##                main="Additive genetic relationships of crosses (subset)",
##                idx.rownames=1:10, idx.colnames=1:10)
summary(diag(A.vr.cross))
summary(A.vr.cross[upper.tri(A.vr.cross)])
summary(A.vr.cross[names.parents[1], grep("off", colnames(A.vr.cross))])
summary(A.vr.cross[names.parents[2], grep("off", colnames(A.vr.cross))])
summary(A.t.cross[names.parents[1], grep("off", colnames(A.t.cross))])
summary(A.t.cross[names.parents[2], grep("off", colnames(A.t.cross))])

Under HWE in a single population, the additive genetic relationships between all parent-child pairs should be centered around 0.5, corresponding to a coancestry coefficient of 1/4.

Simulation of phenotypic data

Experimental design

Set the seed:

set.seed(1859)

Structure for a given year

I.p <- 279       # nb of genotypes in the panel
I.c <- 1         # control
I <- I.p + I.c   # total nb of genotypes
stopifnot(I <= nb.genos)
J <- 5           # nb of blocks
nb.controls.per.block <- 30
(I.block <- I.p + I.c * nb.controls.per.block) # total nb of plants in a block
(N <- I.block * J) # nb of phenotypic data per year
uniq.geno.names <- sample(x=rownames(genomes$genos),
                          size=I, replace=FALSE)
uniq.geno.names <- sort(uniq.geno.names)
(control.name <- uniq.geno.names[1])
uniq.geno.p.names <- uniq.geno.names[-1] # panel names, w/o the control
uniq.block.names <- LETTERS[1:J]
block.geno.names <- c(uniq.geno.p.names,
                      rep(control.name, nb.controls.per.block))
stopifnot(length(block.geno.names) == I.block)
dat.year <- data.frame(geno=rep(NA, N),
                       control=FALSE,
                       block=rep(uniq.block.names, each=I.block),
                       rank=NA,
                       location=NA,
                       year=NA,
                       stringsAsFactors=TRUE)
dat.year$geno <- do.call(c, lapply(1:J, function(j){
  sample(x=block.geno.names, size=I.block)
}))
dat.year$geno <- factor(dat.year$geno)
dat.year$control[dat.year$geno == control.name] <- TRUE
str(dat.year)

Field layout

Block layout in the field:

## field view:
## tions  C (F) ...
## ca     B  E  ...
## lo     A  D  ...
##     ranks ->
## matrix view: ranks are columns, locations are rows
## A D
## B E
## C NA
nb.block.rows <- 3
(nb.block.cols <- ceiling(J / nb.block.rows))
lay.blocks <- matrix(data=NA,
                     nrow=nb.block.rows,
                     ncol=nb.block.cols)
block.idx <- 0
for(j in 1:nb.block.cols){
  for(i in 1:nb.block.rows){
    block.idx <- block.idx + 1
    if(i * j > J)
      next
    lay.blocks[i, j] <- uniq.block.names[block.idx]
  }
}
lay.blocks
ranks.per.block <- 20
(locs.per.block <- ceiling(I.block / ranks.per.block))

Use a space-filling design to locate controls in a block (same design in each block):

tmp <- randtoolbox::sobol(n=nb.controls.per.block, dim=2)
## tmp <- DiceDesign::dmaxDesign(n=nb.controls.per.block, dimension=2, range=0.9, niter_max=200)$design
## plot(x=tmp[,1], y=tmp[,2], xlim=c(0,1), ylim=c(0,1))
ranks.ctl <- cut(tmp[,1], breaks=seq(0, 1, length.out=ranks.per.block),
                 labels=FALSE)
locs.ctl <- cut(tmp[,2], breaks=seq(0, 1, length.out=locs.per.block),
                labels=FALSE)
coords.ctl <- data.frame(rank=ranks.ctl,
                         loc=locs.ctl)
coords.ctl <- coords.ctl[order(coords.ctl$rank, coords.ctl$loc),]
rownames(coords.ctl) <- NULL
stopifnot(! anyDuplicated(coords.ctl))
plot(x=ranks.ctl, y=locs.ctl, xlim=c(1,ranks.per.block),
     ylim=c(1,locs.per.block))

Add spatial coordinates of the genotypes in each block:

ctl.space.fill <- TRUE # if FALSE, controls are randomly located
count.blocks <- 0
for(block.j in 1:nb.block.cols){
  for(block.i in 1:nb.block.rows){
    count.blocks <- count.blocks + 1
    if(count.blocks > J)
      break
    block <- lay.blocks[block.i, block.j]
    idx.block <- which(dat.year$block == block)
    rank.first <- 1 + (block.j - 1) * ranks.per.block
    rank.last <- rank.first + ranks.per.block - 1
    loc.first <- 1 + (block.i - 1) * locs.per.block
    loc.last <- loc.first + locs.per.block - 1
    message(paste0("block=", block, " [", block.i, ",", block.j, "]",
                   " ranks=", rank.first, "-", rank.last,
                   " locs=", loc.first, "-", loc.last,
                   " (", idx.block[1], ")"))
    ranks <- rank.first:rank.last
    locs <- loc.first:loc.last
    grid <- expand.grid(list(loc=locs, rank=ranks))
    grid <- grid[1:I.block, c("rank","loc")]

    if(ctl.space.fill){
      idx.ctl.block <- which(dat.year$control & dat.year$block == block)
      ranks.ctl.block <- ranks.ctl + (block.j-1) * ranks.per.block
      locs.ctl.block <- locs.ctl + (block.i-1) * locs.per.block
      stopifnot(all(ranks.ctl.block >= rank.first),
                all(ranks.ctl.block <= rank.last),
                all(locs.ctl.block >= loc.first),
                all(locs.ctl.block <= loc.last))
      dat.year$rank[idx.ctl.block] <- ranks.ctl.block
      dat.year$location[idx.ctl.block] <- locs.ctl.block
      stopifnot(sum(is.na(dat.year$rank[idx.block])) == I.p,
                sum(is.na(dat.year$location[idx.block])) == I.p)
      to.rmv <- c()
      for(i in 1:nrow(grid)){
        idx <- which(ranks.ctl.block == grid$rank[i])
        if(length(idx) > 0){
          if(grid$loc[i] %in% locs.ctl.block[idx])
            to.rmv <- c(to.rmv, i)
        }
      }
      grid <- grid[-to.rmv,]
      stopifnot(nrow(grid) == I.p)
    }

    idx.grid <- 0
    for(idx.dat in which(! dat.year$control & dat.year$block == block)){
      stopifnot(is.na(dat.year$rank[idx.dat]),
                is.na(dat.year$location[idx.dat]))
      idx.grid <- idx.grid + 1
      dat.year$rank[idx.dat] <- grid$rank[idx.grid]
      dat.year$location[idx.dat] <- grid$loc[idx.grid]
    }
  }
}
range(dat.year$rank, na.rm=TRUE)
(nb.ranks <- length(unique(dat.year$rank[! is.na(dat.year$rank)])))
range(dat.year$location, na.rm=TRUE)
(nb.locs <- length(unique(dat.year$location[! is.na(dat.year$location)])))

Plot the field layout:

lay <- SpatialPointsDataFrame(coords=dat.year[, c("rank","location")],
                              data=dat.year[, c("geno","control","block")])
summary(lay)
## lay@data$control <- as.factor(lay@data$control)
## lay@data$block <- as.factor(lay@data$block)
spplot(obj=lay, zcol="block", scales=list(draw=TRUE),
       xlab="ranks", ylab="locations",
       main="Field layout",
       key.space="right", aspect="fill")
spplot(obj=lay, zcol="control",
       col.regions=c("green","red"),
       scales=list(draw=TRUE),
       xlab="ranks", ylab="locations",
       main="Controls",
       key.space="right", aspect="fill")

Structure for the whole data set

Duplicate the data structure per year:

dat <- dat.year
first.year <- 2011
dat$year <- first.year
K <- 2 # nb of years
if(K > 1){
  uniq.year.names <- as.character(first.year:(first.year + K - 1))
  for(year in uniq.year.names[-1]){
    dat <- rbind(dat, dat.year)
    dat$year[is.na(dat$year)] <- year
  }
}
dat$year <- factor(dat$year)
str(dat)
summary(dat)

Add the other regressors:

(N <- I.block * J * K) # nb of phenotypic data for all years
stopifnot(N == nrow(dat))
dat$covar1 <- rnorm(n=N, mean=0, sd=1)
dat$covar2 <- rnorm(n=N, mean=0, sd=1)
dat$fact1 <- sample(x=c("pos","neg"), size=N, replace=TRUE,
                    prob=c(0.7, 1-0.7))
dat$fact1 <- as.factor(dat$fact1)
dat$fact2 <- sample(x=c("pos","neg"), size=N, replace=TRUE,
                    prob=c(0.9, 1-0.9))
dat$fact2 <- as.factor(dat$fact2)
str(dat)

Simulation of trait1

Make design matrices

Fixed effects

Assuming covar2 and fact2 have no effect:

truth.form.fix <- "1 + block + year + covar1 + fact1"
X <- model.matrix(formula(paste0("~ ", truth.form.fix)),
                  data=dat)
dim(X)
Q <- ncol(X)

Genotypic values

Z <- model.matrix(~ -1 + geno, data=dat)
dim(Z)

Spatial heterogeneity

Z.S <- model.matrix(~ -1 + factor(location):factor(rank), data=dat)
dim(Z.S)
colnames(Z.S) <- gsub("factor\\(|\\)", "", colnames(Z.S))
Z.S[1:3, 1:3]

Set input values

To obtain realistic data:

mu <- 100    # intercept
y.min <- 20  # minimal phenotypic value
CV.g <- 0.10 # genotypic coef of variation
h2 <- 0.4    # narrow-sense heritability

Deduce the variances

(sigma.p2 <- ((mu - y.min) / 4)^2) # phenotypic variance
(sigma.g2 <- (CV.g * mu)^2)        # total genetic variance
stopifnot(sigma.g2 < sigma.p2)
sigma.a2 <- sigma.g2               # additive genetic variance
(sigma2 <- ((1 - h2) / h2) * sigma.a2) # error variance
stopifnot(sigma.g2 + sigma2 < sigma.p2)
(sigma.beta2 <- sigma.p2 - (sigma.g2 + sigma2))
(sigma.alpha2 <- sigma.a2 / (2 * sum(afs.pop * (1 - afs.pop)))) # variance of additive SNP effects

Sample effects

Fixed effects

beta <- setNames(c(mu, rnorm(n=Q - 1, mean=0, sd=sqrt(sigma.beta2))),
                 colnames(X))
beta
summary((X %*% beta)[,1])
var((X %*% beta)[,1])

Additive SNP effects

M <- genomes$genos[uniq.geno.names,]
dim(M)
P <- ncol(M)
afs <- estimSnpAf(M) # afs.pop
cor(afs, afs.pop); # plot(afs, afs.pop[names(afs)])
M.a.c <- M - matrix(rep(1, I)) %*% (2 * afs)
alpha <- setNames(rnorm(n=P, mean=0, sd=sqrt(sigma.alpha2)),
                  colnames(M.a.c))
summary(alpha)
hist(alpha)

Update the genetic architecture so that there are few large effects ("major QTLs"):

nbMajorQtls <- 3
mafs <- estimSnpMaf(afs=afs)
chrsWithMajorQtls <- sample(unique(genomes$snp.coords$chr), nbMajorQtls)
majorQtls <- c()
for(i in seq_along(chrsWithMajorQtls)){
  mafsChr <- mafs[rownames(genomes$snp.coords[genomes$snp.coords$chr ==
                                              chrsWithMajorQtls[i],])]
  majorQtls <- c(majorQtls,
                 sample(names(mafs)[mafs > 0.15], 1))
}
alpha[majorQtls] <- 10 * sign(alpha[majorQtls])
hist(alpha)

Dominance SNP effects

boundaries <- seq(from=0, to=2, length.out=4)
is.0 <- (M <= boundaries[2]) # homozygotes for the first allele
is.1 <- (M > boundaries[2] & M <= boundaries[3]) # heterozygotes
is.2 <- (M > boundaries[3]) # homozygotes for the second allele
M.d.c <- matrix(rep(NA, length(M)), nrow=I, ncol=P, dimnames=dimnames(M))
for(i in 1:I){
  M.d.c[i, is.0[i,]] <- - 2 * afs[is.0[i,]]^2
  M.d.c[i, is.1[i,]] <- 2 * afs[is.1[i,]] * (1 - afs[is.1[i,]])
  M.d.c[i, is.2[i,]] <- - 2 * (1 - afs[is.2[i,]])^2
}
delta <- setNames(rep(0, P),
                  colnames(M.d.c))
## delta <- setNames(rnorm(n=P, mean=0, sd=sqrt(sigma.delta2)),
##                   colnames(M.d.c))
summary(delta)

Total genotypic values

a <- M.a.c %*% alpha
d <- M.d.c %*% delta
g <- as.vector(a) + as.vector(d) # ignoring epistasis
names(g) <- rownames(M)
summary(g)
var(g)
hist(g)
for(i in seq_along(majorQtls)){
  print(alpha[majorQtls[i]])
  boxplotCandidateQtl(y=M.a.c %*% alpha, X=M, snp=majorQtls[i], main=majorQtls[i],
                      show.points=TRUE)
}

Genotype-year interactions

TODO

Spatial heterogeneity

Draw many samples

Draw several fields with spatial heterogeneity:

## rho.rank <- rho.loc <- 0
rho.rank <- 0.8
rho.loc <- 0.8
many.etas <- simulAr1Ar1(n=100, R=nb.locs, C=nb.ranks,
                         rho.r=rho.loc, rho.c=rho.rank,
                         sigma.X.2=0,
                         sigma.e.2=sigma2)
dim(many.etas)
stopifnot(dim(many.etas)[1] == nb.locs)  # rows
stopifnot(dim(many.etas)[2] == nb.ranks) # columns

Check that the correlations between ranks and between locations correspond to the input values:

all.cor <- list()
all.cor$loc <- c(apply(many.etas, 3, function(Xi){
  apply(Xi, 2, function(Xi.col){ # per column, hence cor btw rows
    acf(Xi.col, lag.max=1, type="correlation", plot=FALSE)$acf[2]
  })
}))
all.cor$rank <- c(apply(many.etas, 3, function(Xi){
  apply(Xi, 1, function(Xi.row){ # per row, hen cor btw columns
    acf(Xi.row, lag.max=1, type="correlation", plot=FALSE)$acf[2]
  })
}))
sapply(all.cor, length)
sapply(all.cor, summary) # to be compared to rho.loc and rho.rank

Choose one

Choose one sample with which phenotypes will be simulated:

eta.mat <- many.etas[,,1] # extract matrix
dim(eta.mat)
stopifnot(nrow(eta.mat) == nb.locs)
rownames(eta.mat) <- paste0("location", 1:nrow(eta.mat))
stopifnot(ncol(eta.mat) == nb.ranks)
colnames(eta.mat) <- paste0("rank", 1:ncol(eta.mat))
eta.mat[1:3, 1:3]

Check the correlations:

all.cor <- list()
all.cor$loc <- apply(eta.mat, 2, function(eta.col){ # per column, hence cor btw rows
  acf(eta.col, lag.max=1, type="correlation", plot=FALSE)$acf[2]
})
all.cor$rank <- apply(eta.mat, 1, function(eta.row){ # per row, hence cor btw columns
  acf(eta.row, lag.max=1, type="correlation", plot=FALSE)$acf[2]
})
sapply(all.cor, length)
sapply(all.cor, summary) # to be compared to rho.loc and rho.rank

Reformat the matrix into a vector:

eta <- c(eta.mat)         # vectorize it
stopifnot(length(eta) == ncol(Z.S))
summary(eta)
var(Z.S %*% eta)
names(eta) <- paste0(rep(rownames(eta.mat), times=ncol(eta.mat)),
                     ":", rep(colnames(eta.mat), each=nrow(eta.mat)))
stopifnot(all(names(eta) == colnames(Z.S)))
head(eta)

Plot it

coords.sh <- as.matrix(expand.grid(1:nb.locs, 1:nb.ranks))
apply(coords.sh, 2, range)
colnames(coords.sh) <- c("location", "rank")
dim(coords.sh)
head(coords.sh)
eta.df <- data.frame(eta=eta)
head(eta.df)
sh.sp <- SpatialPointsDataFrame(coords=coords.sh, data=eta.df)
spplot(obj=sh.sp, scales=list(draw=TRUE),
       xlab="ranks", ylab="locations",
       main=paste0("Spatial heterogeneity with cor.rank=", rho.rank,
                   ", cor.loc=", rho.loc, " and sigma2=", round(sigma2)),
       key.space="right", aspect="fill")

Fit a model on it

vg.1 <- gstat::variogram(object=eta ~ 1, data=sh.sp)
plot(x=vg.1$dist, y=vg.1$gamma, main="Empirical variogram",
     ylim=c(0, max(vg.1$gamma)))
(vg.1.fit <- suppressWarnings(
     gstat::fit.variogram(object=vg.1,
                          model=vgm(model="Ste", nugget=NA))))
plot(vg.1, vg.1.fit, main="", col="blue",
     key=list(space="top", lines=list(col="blue"),
              text=list("fit of Matérn model")))

Errors

if(all(rho.rank == 0, rho.loc == 0)){
  epsilon <- rnorm(n=N, mean=0, sd=sqrt(sigma2))
} else
  epsilon <- rnorm(n=N, mean=0, sd=0)
summary(epsilon)
var(epsilon)

Phenotypic values

stopifnot(all(colnames(X) == colnames(beta)))
y <- X %*% beta
stopifnot(all(colnames(Z) == colnames(g)))
y <- y + Z %*% g
if(any(rho.rank != 0, rho.loc != 0)){
  stopifnot(all(colnames(Z.S) == colnames(eta)))
  y <- y + Z.S %*% eta
}
y <- y + epsilon
y <- as.vector(y)
summary(y)
var(y)
dat$trait1.init <- y
avg_y <- tapply(dat$trait1.init, dat$geno, mean, na.rm=TRUE)
regplot(g, avg_y[names(g)], pred.int=FALSE)

Missing data

Create missing data depending on the year:

dat$trait1 <- dat$trait1.init
## max.prop.na <- 0
max.prop.na <- 0.1
(prop.na <- setNames(runif(n=K, min=0, max=max.prop.na),
                     uniq.year.names))
for(year in names(prop.na)){
  if(prop.na[year] == 0)
    next
  set.at.na <- sample(x=c(TRUE, FALSE), replace=TRUE,
                      size=sum(dat$year == year),
                      prob=c(prop.na[year], 1 - prop.na[year]))
  if(any(set.at.na)){
    message(paste0(year, ": ", sum(set.at.na), " NAs"))
    dat$trait1[dat$year == year & set.at.na] <- NA
  }
}

Set negative values to NA:

is.neg <- (dat$trait1 <= 0)
if(any(is.neg)){
  idx.neg <- which(is.neg)
  message(length(idx.neg))
  dat$trait1[idx.neg] <- NA
}

Outliers

Create upper outliers:

idx.notNA <- which(! is.na(dat$trait1))
## prop.out <- 0
prop.out <- 0.002
idx.out <- sample(x=idx.notNA, size=floor(prop.out * length(idx.notNA)))
dat$true.outlier <- FALSE
dat$true.outlier[idx.out] <- TRUE
summary(dat$trait1)
(mean.out <- max(dat$trait1, na.rm=TRUE) + median(dat$trait1, na.rm=TRUE))
(sd.out <- 0.5 * sd(dat$trait1, na.rm=TRUE))
dat$trait1[idx.out] <- rnorm(n=length(idx.out),
                             mean=mean.out,
                             sd=sd.out)

Data exploration

Tables

Whole panel

dim(dat)
lapply(setNames(levels(dat$year), levels(dat$year)), function(lev.y){
  do.call(rbind,
          lapply(setNames(levels(dat$block), levels(dat$block)),
                 function(lev.b){
                   betterSummary(subset(dat, year == lev.y & block == lev.b,
                                        select=trait1)$trait1)
                 }))
})

Controls

dat.ctl <- droplevels(dat[dat$geno == control.name,])
dim(dat.ctl)
lapply(setNames(levels(dat.ctl$year), levels(dat.ctl$year)), function(lev.y){
  do.call(rbind,
          lapply(setNames(levels(dat.ctl$block), levels(dat.ctl$block)),
                 function(lev.b){
                   betterSummary(subset(dat.ctl, year == lev.y & block == lev.b,
                                        select=trait1)$trait1)
                 }))
})

Plots

Sources of variation

Outliers:

lower.threshold <- -Inf
upper.threshold <- 1800
x <- "trait1.outlier"
dat[[x]] <- FALSE
is.outlier <- dat[["trait1"]] < lower.threshold |
  dat[["trait1"]] > upper.threshold
sum(is.outlier, na.rm=TRUE)
dat[[x]][is.outlier] <- TRUE
par(mar=c(7, 4, 4, 2) + 0.1)
beanplot(trait1 ~ fact1 + block + year,
         data=dat, log="",
         xlab="", xaxt="n",
         ylab="phenotypes",
         main="Sources of variation",
         side="both", las=1, what=c(1,1,1,0),
         col=list("black","grey"), border=c("black","grey"),
         ylim=c(-5, 1.3 * max(dat$trait1, na.rm=TRUE)))
labels <- paste0(rep(levels(dat$block), nlevels(dat$year)), " in ",
                 rep(levels(dat$year), each=nlevels(dat$block)))
nb.ticks <- nlevels(dat$block) * nlevels(dat$year)
axis(1, at=1:nb.ticks, labels=FALSE)
text(x=1:nb.ticks,
     y=par("usr")[3] - 0.12 * abs(par("usr")[4] - par("usr")[3]),
     srt=35, adj=0.5,
     labels=labels, xpd=TRUE)
mtext(1, text="blocks and years", line=5)
abline(v=seq(from=nlevels(dat$block) + 0.5, to=nb.ticks,
             by=nlevels(dat$block)))
legend("topleft", legend=paste0("fact1: ", levels(dat$fact1)),
       fill=c("black","grey"), border=NA, bty="n")
lgd <- c()
lgd.lty <- c()
lgd.col <- c()
abline(h=c(lower.threshold, upper.threshold), lty=2)
if(any(dat[["trait1.outlier"]])){
  lgd <- c(lgd, "outlier threshold")
  lgd.lty <- c(lgd.lty, 2)
  lgd.col <- c(lgd.col, "black")
}
q1.ctl <- quantile(dat$trait1, probs=0.25, na.rm=TRUE)
q3.ctl <- quantile(dat$trait1, probs=0.75, na.rm=TRUE)
abline(h=c(q1.ctl, q3.ctl), col="red")
lgd <- c(lgd, "controls (Q1, Q3)")
lgd.lty <- c(lgd.lty, 1)
lgd.col <- c(lgd.col, "red")
legend("topright", legend=lgd, lty=lgd.lty, col=lgd.col, bty="n")

Missing data

for(trait in c("trait1")){
  for(year in levels(dat$year)){
    lay.tmp <- lay
    lay.tmp@data[["missing"]] <- "false"
    lay.tmp@data[["missing"]][is.na(dat[dat$year == year, trait])] <- "true"
    lay.tmp@data[["missing"]] <- as.factor(lay.tmp@data[["missing"]])
    print(spplot(obj=lay.tmp, zcol="missing", col.regions=c("green","red"),
                 scales=list(draw=TRUE),
                 xlab="ranks", ylab="locations",
                 main=paste0("Missing data for ", trait, " in ", year),
                 key.space="right", aspect="fill"))
  }
}

First phase of the analysis

Data preparation

str(dat)
trait <- "trait1"
transfo <- "id"
stopifnot(transfo %in% c("id", "log", "sqrt"))
if(any(dat[[paste0(trait, ".outlier")]])){
  dat <- droplevels(dat[! dat[[paste0(trait, ".outlier")]],])
  print(str(dat))
}
if(transfo == "id"){
  response <- trait
} else if(transfo == "log"){
  response <- paste0("log(", trait, ")")
  dat[[response]] <- log(dat[[trait]])
} else if(transfo == "sqrt"){
  response <- paste0("sqrt(", trait, ")")
  dat[[response]] <- sqrt(dat[[trait]])
}

Model fitting, comparison and selection

Approach:

  1. model fit, comparison and selection on control data,

  2. correct spatial heterogeneity via controls,

  3. model fit, comparison and selection on panel data.

If spatial heterogeneity is ignored, only step 3 is performed.

glob.form <- paste0(response, " ~ 1",
                    " + block",
                    " + year",
                    " + covar1",
                    " + covar2",
                    ## " + covar1:covar2",
                    " + fact1",
                    " + fact2",
                    " + fact1:fact2")#,
                    ## " + covar1:fact1",
                    ## " + covar1:fact2",
                    ## " + covar2:fact1",
                    ## " + covar2:fact2")
out <- plantTrialLmmFitCompSel(glob.form=glob.form,
                               dat=dat[dat$control,
                                       c(response,"block","year",
                                         "covar1","covar2","fact1","fact2")],
                               part.comp.sel="fixed",
                               saved.file="two-phase_quantgen_ctl_allmod-sel-ML.RData",
                               nb.cores=nb.cores, verbose=1)

Correct spatial heterogeneity

Kriging using controls:

fix.preds.corr.spat <- out$all.preds
idx <- grep("\\(*\\|*\\)|year", out$all.preds)
if(length(idx) > 0)
  fix.preds.corr.spat <- out$all.preds[-idx]
dat <- correctSpatialHeterogeneity(dat=dat,
                                   response=response,
                                   fix.eff=fix.preds.corr.spat,
                                   verbose=2)

Re-fit the best model using the panel data corrected for spatial heterogeneity:

corr.spat.het <- TRUE
if(corr.spat.het){
  glob.form <- gsub(response, paste0(response, ".csh"), glob.form)
  response <- paste0(response, ".csh")
}
glob.form <- paste0(glob.form,
                    " + (1|geno)",
                    " + (1|geno:year)")
out <- plantTrialLmmFitCompSel(glob.form=glob.form,
                               dat=dat[! dat$control,
                                       c(response,"block","year",
                                         "covar1","covar2","fact1","fact2",
                                         "geno","rank","location")],
                               part.comp.sel=c("fixed","random"),
                               saved.file="two-phase_quantgen_panel_allmod-sel-ML.RData",
                               nb.cores=nb.cores, verbose=1)
dat.noNA <- out$dat.noNA
bestmod.ml <- out$bestmod.ml
bestmod.reml <- out$bestmod.reml

Best model fit to get genotypic BLUEs and EMMs

Re-fit, modeling genotypic values as fixed effects to get their BLUEs:

best.form.as.fix <- paste0(response, " ~ 1 + ",
                           paste(out$best.preds.fix, collapse=" + "),
                           " + ", paste(out$best.preds.rnd, collapse=" + "))
bestmod.lm <- lm(formula=best.form.as.fix,
                 data=dat.noNA,
                 na.action=na.fail)

Re-fit, modeling genotypic values as fixed effects to get their EMMs:

bestmod.emm <- emmeans(bestmod.lm, "geno")

Assumption checking

Residual preparation

fit.all <- cbind(out$dat,
                 response=out$dat[[response]],
                 cond.res=NA,
                 scl.cond.res=NA,
                 fitted=NA)
idx.NA <- attr(dat.noNA, "na.action")
if(length(idx.NA) > 0){
  fit.all$cond.res[- idx.NA] <- residuals(bestmod.reml)
  fit.all$scl.cond.res[- idx.NA] <- residuals(bestmod.reml) /
    sigma(bestmod.reml)
  fit.all$fitted[- idx.NA] <- fitted(bestmod.reml)
} else{
  fit.all$cond.res <- residuals(bestmod.reml)
  fit.all$scl.cond.res <- residuals(bestmod.reml) /
    sigma(bestmod.reml)
  fit.all$fitted <- fitted(bestmod.reml)
}
geno.blups <- ranef(bestmod.reml, condVar=TRUE, drop=TRUE)$geno
geno.var.blups <- setNames(attr(geno.blups, "postVar"),
                           names(geno.blups))

TODO:

## geno.blues
## geno.emmeans

Error homoscedasticity

x.lim <- max(abs(fit.all$scl.cond.res), na.rm=TRUE)
plot(x=fit.all$scl.cond.res, y=fit.all$fitted, las=1,
     xlim=c(-x.lim, x.lim),
     xlab="scaled conditional residuals",
     ylab="fitted responses",
     main=response)
abline(v=c(0, -1.96, 1.96), lty=c(2, 3, 3))
lattice::xyplot(fitted ~ scl.cond.res | year, groups=block,
                data=fit.all,
                xlab="scaled conditional residuals",
                ylab="fitted responses",
                main=response,
                auto.key=list(space="right"),
                panel=function(x,y,...){
                  panel.abline(v=c(0, -1.96, 1.96), lty=c(2, 3, 3))
                  panel.xyplot(x,y,...)
                })
boxplot(scl.cond.res ~ block + year,
        data=fit.all,
        xlab="scaled conditional residuals",
        ## ylab="blocks and years",
        main=response,
        varwidth=TRUE, notch=TRUE, horizontal=TRUE, las=1)
abline(v=c(0, -1.96, 1.96), lty=c(2, 3, 3))

Error normality

shapiro.test(fit.all$scl.cond.res)
qqnorm(y=fit.all$scl.cond.res,
       main=paste0(response, ": scaled conditional residuals"))
qqline(y=fit.all$scl.cond.res, col="red")

Temporal independence of errors

for(block in levels(fit.all$block))
  plotResidualsBtwYears(df=fit.all, colname.res="scl.cond.res",
                        years=c("2011","2012"), blocks=rep(block,2),
                        cols.uniq.id=c("geno","block","rank","location"),
                        main=paste0("Block ", block))
plotResidualsBtwYears(df=fit.all, colname.res="scl.cond.res",
                      years=c("2011","2012"), blocks=c("A","B"),
                      cols.uniq.id="geno",
                      main="Blocks A and B")
plotResidualsBtwYears(df=fit.all, colname.res="scl.cond.res",
                      years=c("2011","2012"), blocks=c("A","C"),
                      cols.uniq.id="geno",
                      main="Blocks A and C")

Spatial independence of errors

lay.tmp <- lay
for(year in levels(fit.all$year)){
  lay.tmp[[paste0("scl.cond.res.", year)]] <- NA
  for(i in 1:nrow(lay.tmp)){
    j <- which(fit.all$block == lay.tmp$block[i] &
               fit.all$rank == lay.tmp$rank[i] &
               fit.all$location == lay.tmp$location[i] &
               fit.all$year == year)
    if(length(j) == 0){
      next
    } else if(length(j) == 1){
      lay.tmp[[paste0("scl.cond.res.", year)]][i] <- fit.all$scl.cond.res[j]
    } else
      warning(paste0("year=", year, "i=", i))
  }
}
for(year in levels(fit.all$year)){
  out <- spplot(obj=lay.tmp, zcol=paste0("scl.cond.res.", year),
                xlab=colnames(coordinates(lay.tmp))[1],
                ylab=colnames(coordinates(lay.tmp))[2],
                main=paste0(response, ": scaled conditional residuals in ", year),
                key.space="right", scales=list(draw=TRUE), aspect="fill")
  print(out)
}

Look at the empirical variogram, and fit a variogram model:

for(year in levels(fit.all$year)){
  tmp <- na.omit(as.data.frame(lay.tmp[,paste0("scl.cond.res.", year)]))
  resid.vg <- variogram(object=formula(paste0("scl.cond.res.",
                                              year, " ~ 1")),
                        locations=~ rank + location, data=tmp)
  ## print(plot(resid.vg,
  ##            main=paste0(response, ": empirical variogram in ", year)))
  resid.vg.fit <- fit.variogram(object=resid.vg,
                                model=vgm(c("Exp", "Sph", "Gau", "Ste")),
                                fit.sills=TRUE,
                                fit.ranges=TRUE,
                                fit.kappa=seq(0.3, 5, 0.05))
  print(resid.vg.fit)
  print(plot(resid.vg, resid.vg.fit, main="", col="blue",
             key=list(space="top", lines=list(col="blue"),
                      text=list(paste0(response, ": fit of variogram model",
                                       " (", resid.vg.fit[2, "model"], ")",
                                       " in ", year)))))
}

Outlying genotypes

x.lim <- max(abs(geno.blups))
par(mar=c(5,6,4,1))
plot(x=geno.blups, y=1:length(geno.blups),
     xlim=c(-x.lim, x.lim),
     xlab="genotypic BLUPs",
     main=response,
     yaxt="n", ylab="")
axis(side=2, at=1:length(geno.blups), labels=names(geno.blups), las=1)
abline(v=0, lty=2)
idx <- which.max(geno.blups)
text(x=geno.blups[idx], y=idx, labels=names(idx), pos=2)
idx <- which.min(geno.blups)
text(x=geno.blups[idx], y=idx, labels=names(idx), pos=4)
summary(geno.var.blups)
head(sort(geno.var.blups))
tail(sort(geno.var.blups))

Normality of genotypic BLUPs

shapiro.test(geno.blups)
qqnorm(y=geno.blups, main=paste0(response, ": genotypic BLUPs"), asp=1)
qqline(y=geno.blups, col="red")

Independence between errors and genotypes

x.lim <- max(abs(fit.all$scl.cond.res))
lattice::dotplot(geno ~ scl.cond.res, data=fit.all,
                 xlim=c(-x.lim, x.lim),
                 xlab="scaled conditional residuals",
                 main=response,
                 panel=function(x,y,...){
                   panel.abline(v=c(0, -1.96, 1.96), lty=c(2,3,3))
                   panel.dotplot(x,y,...)
                 })
lattice::dotplot(geno ~ scl.cond.res | year, groups=block,
                 data=fit.all,
                 auto.key=list(space="right"),
                 xlab="scaled conditional residuals",
                 main=response,
                 panel=function(x,y,...){
                   panel.abline(v=c(0, -1.96, 1.96), lty=c(2,3,3))
                   panel.dotplot(x,y,...)
                 })

Model outputs

Summary

(vc.ml <- as.data.frame(VarCorr(bestmod.ml)))
summary(bestmod.reml)
(vc.reml <- as.data.frame(VarCorr(bestmod.reml)))
if(FALSE){
  summary(bestmod.lm)
  summary(bestmod.emm)
}

Broad-sense heritability

tmp <- estimH2means(dat=dat.noNA, colname.resp=response,
                    colname.trial="year", vc=vc.reml,
                    geno.var.blups=geno.var.blups)
tmp$mean.nb.trials
tmp$mean.nb.reps.per.trial
tmp$H2.classic
tmp$H2.oakey
mySumm <- tmp$sryStat

Confidence intervals

Profiling

if(FALSE){
  st <- system.time(
      prof <- profile(bestmod.ml, signames=FALSE,
                      parallel="multicore", ncpus=nb.cores))
  print(st)
  (ci <- confint(prof, level=0.95))
  xyplot(prof, absVal=TRUE, conf=c(0.8, 0.95), which=1:2,
         main=response)
  densityplot(prof,
              main=response)
  splom(prof, conf=c(0.8, 0.95), which=1:2,
        main=response)
}

Bootstrapping

mySumm(bestmod.reml)
mySumm(bestmod.ml)
if(FALSE){
  st <- system.time(
      fit.boot <- bootMer(x=bestmod.ml, FUN=mySumm,
                          nsim=1*10^3, seed=1859,
                          use.u=FALSE, type="parametric",
                          parallel="multicore", ncpus=nb.cores))
  print(st)
  print(fit.boot)
  for(i in seq_along(fit.boot$t0)){
    message(names(fit.boot$t0)[i])
    plot(fit.boot, index=i,
         main=paste0(response, ": ", names(fit.boot$t0)[i]))
    print(boot.ci(fit.boot, conf=c(0.8, 0.95),
                  type=c("norm", "basic", "perc"),
                  index=i))
  }
}

Hypothesis testing

Fixed effects

anova(bestmod.reml)
## anova(bestmod.reml.lmerTest, ddf="lme4")
## anova(bestmod.reml.lmerTest, ddf="Satterthwaite")
## system.time(
##     print(anova(bestmod.reml.lmerTest, ddf="Kenward-Roger")))

Variance components

ranova(bestmod.reml)

In-sample prediction

cor(fit.all$fitted, fit.all$response, use="complete.obs")
plot(fit.all$fitted, fit.all$response, las=1, asp=1,
     xlab="observed response",
     ylab="fitted responses",
     main=response)
abline(a=0, b=1, lty=2)
abline(v=mean(fit.all$fitted, na.rm=TRUE), lty=2)
abline(h=mean(fit.all$response, na.rm=TRUE), lty=2)
tapply(1:nrow(fit.all), fit.all$year, function(idx){
  cor(fit.all$fitted[idx], fit.all$response[idx], use="complete.obs")
})
tapply(1:nrow(fit.all), list(fit.all$year, fit.all$block), function(idx){
  cor(fit.all$fitted[idx], fit.all$response[idx], use="complete.obs")
})
lattice::xyplot(response ~ fitted | year, groups=block,
                data=fit.all,
                auto.key=list(space="right"),
                xlab="observed response",
                ylab="fitted responses",
                main=response,
                panel=function(x,y,...){
                  panel.abline(a=0, b=1, lty=2)
                  panel.abline(v=mean(x, na.rm=TRUE), lty=2)
                  panel.abline(h=mean(y, na.rm=TRUE), lty=2)
                  panel.xyplot(x,y,...)
                })

Comparison with true values

regplot(x=geno.blups, y=g[names(geno.blups)],
        xlab="eBLUPs of genotypic values",
        ylab="true genotypic values",
        main="Simulated data")

Second phase of the analysis

Reformatting

matSnps <- M[names(geno.blups),]
afs <- estimSnpAf(matSnps) # afs.pop
cor(afs, afs.pop); # plot(afs, afs.pop[names(afs)])
matAdd <- estimGenRel(X=matSnps, afs=afs, method="vanraden1")
matSnps_ok <- matSnps[, estimSnpMaf(afs=afs) > 0.05]
ncol(matSnps_ok)
snp.coords_ok <- genomes$snp.coords[colnames(matSnps_ok),]
afs.ok <- afs[colnames(matSnps_ok)]
matSnps_ok_ctr <- matSnps_ok - matrix(rep(1,nrow(matSnps_ok))) %*% (2 * afs.ok)
head(matSnps_ok_ctr[,majorQtls])
head(M.a.c[rownames(matSnps_ok_ctr),majorQtls])

univariate, SNP-by-SNP (MM4LMM)

Fit

p2f <- "two-phase_quantgen_MM4LMM.rds"
if(! file.exists(p2f)){
  st <- system.time(
      ## out_mm4lmm_1 <- MMEst(Y=g[names(geno.blups)], X=matSnps_ok_ctr,
      out_mm4lmm_1 <- MMEst(Y=geno.blups, X=matSnps_ok,
                            VarList=list(Additive=matAdd,
                                         Error=diag(length(geno.blups)))))
  print(st)
  saveRDS(out_mm4lmm_1, p2f)
  print(tools::md5sum(path.expand(p2f)))
} else{
  print(tools::md5sum(path.expand(p2f)))
  load(p2f)
}
out_mm4lmm_2 <- AnovaTest(out_mm4lmm_1, Type="TypeI")

Reformat

out_mm4lmm_2[[1]]
(snpError <- which(sapply(out_mm4lmm_2, function(x){
  if(! "Xeffect" %in% rownames(x)) TRUE else FALSE
})))
pvals_mm4lmm <- sapply(out_mm4lmm_2, function(x){
  if("Xeffect" %in% rownames(x))  x["Xeffect","pval"] else NA
})
pvals <- cbind(snp.coords_ok[names(pvals_mm4lmm),], pvals_mm4lmm)
colnames(pvals) <- c("Chrom", "Position", "p.val")
pvals$p.val <- -log10(pvals$p.val)

Plot

mat <- matrix(c(1,1,2,3), nrow=2, ncol=2, byrow=T)
layout(mat, widths = c(4,4), heights = c(3,3))

sommer::manhattan(pvals, pch=20, cex=1)

plotHistPval(10^-pvals$p.val, main="", ylim=c(0,1.5), legend.x=NULL)

col <- setNames(rep("black", nrow(pvals)), rownames(pvals))
col[majorQtls] <- "red"
pch <- setNames(rep(1, nrow(pvals)), rownames(pvals))
pch[majorQtls] <- 20
tmp <- qqplotPval(10^-pvals$p.val, main="", col=col, pch=pch,
                  ctl.fdr.bh=TRUE, verbose=0)
lim.pv.bh <- attr(tmp, "lim.pv.bh")
segments(x0=par("usr")[1], y0=-log10(lim.pv.bh),
         x1=-log10(lim.pv.bh), y1=-log10(lim.pv.bh),
         lty=3)

10^-pvals[majorQtls,"p.val"]

univariate, SNPs jointly (MLMMGWAS)

TODO

uniavriate, SNPs jointly (varbvs)

TODO

multivariate, SNP-by-SNP

TODO

multivariate, SNPs jointly

TODO

Appendix

t1 <- proc.time(); t1 - t0
print(sessionInfo(), locale=FALSE)


timflutre/rutilstimflutre documentation built on Feb. 7, 2024, 8:17 a.m.