R.v.maj <- as.numeric(R.version$major)
R.v.min.1 <- as.numeric(strsplit(R.version$minor, "\\.")[[1]][1])
if(R.v.maj < 2 || (R.v.maj == 2 && R.v.min.1 < 15))
  stop("requires R >= 2.15", call.=FALSE)

suppressPackageStartupMessages(library(knitr))
opts_chunk$set(echo=TRUE, warning=TRUE, message=TRUE, cache=FALSE, fig.align="center")
opts_knit$set(progress=TRUE, verbose=TRUE)

Overview

This document requires external packages:

suppressPackageStartupMessages(library(scrm))
suppressPackageStartupMessages(library(qqman))
suppressPackageStartupMessages(library(qvalue))
suppressPackageStartupMessages(library(ashr))
suppressPackageStartupMessages(library(mlmm))
suppressPackageStartupMessages(library(mlmm.gwas))
suppressPackageStartupMessages(library(varbvs))
stopifnot(compareVersion("2.5",
                         as.character(packageVersion("varbvs")))
          != 1)
suppressPackageStartupMessages(library(coda))
suppressPackageStartupMessages(library(BGLR))
suppressPackageStartupMessages(library(rgs3))
stopifnot(compareVersion("0.5.5",
                         as.character(packageVersion("rgs3")))
          != 1)
suppressPackageStartupMessages(library(glmnet))
suppressPackageStartupMessages(library(rutilstimflutre))
stopifnot(compareVersion("0.160.0",
                         as.character(packageVersion("rutilstimflutre")))
          != 1)

This R chunk is used to assess how much time it takes to execute the R code in this document until the end:

t0 <- proc.time()

Simulate some data

Genotypes (bi-allelic SNPs)

set.seed(1859)
nb.genos <- 200
Ne <- 10^4
nb.chroms <- 10
chrom.len <- 10^5
mu <- 10^(-8)
c.rec <- 10^(-7)
system.time(
    genomes <- simulCoalescent(nb.inds=nb.genos,
                               nb.reps=nb.chroms,
                               pop.mut.rate=4 * Ne * mu * chrom.len,
                               pop.recomb.rate=4 * Ne * c.rec * chrom.len,
                               chrom.len=chrom.len,
                               get.alleles=TRUE))
afs <- estimSnpAf(X=genomes$genos)
plotHistAllelFreq(afs=afs)
X <- discardSnpsLowMaf(X=genomes$genos, thresh=0.01)
afs <- afs[colnames(X)]
A.vr <- estimGenRel(X=X, relationships="additive", method="vanraden1")
A.cs <- estimGenRel(X=X, relationships="additive", method="center-std")
table(genomes$snp.coords$chr)
cn2i <- chromNames2integers(x=genomes$snp.coords$chr,
                            prefix="chr")

Phenotypes (single trait, no dominance, no weight)

set.seed(1859)
true.pi <- 0.01
true.pve <- 0.7
true.sigma.a2 <- 1
phenos <- simulBvsr(Q=1, X=X, pi=true.pi, pve=true.pve,
                    sigma.a2=true.sigma.a2, min.maf=0.01)
stopifnot(colnames(phenos$X.A) == colnames(X))
stopifnot(names(phenos$a) == colnames(phenos$X.A))
stopifnot(names(phenos$gamma) == names(phenos$a))
true.qtls <- names(phenos$gamma[phenos$gamma != 0])
summary(abs(phenos$a[true.qtls]))
length(true.qtls)
tmp <- data.frame(chr=genomes$snp.coords[true.qtls, "chr"],
                  pos=genomes$snp.coords[true.qtls, "pos"],
                  a=phenos$a[true.qtls])
(tmp <- tmp[order(abs(tmp$a), decreasing=TRUE),])

Explore the data

hist(phenos$Y[,1], breaks="FD",
     xlab="phenotypic values", main="Simulated data",
     col="grey", border="white", las=1)

TODO: plot LD around causal SNPs

Perform inference SNP-by-SNP with GEMMA

Ref: Zhou and Stephens (2012)

Fit

snp.coords <- data.frame(coord=genomes$snp.coords$pos,
                         chr=cn2i$renamed,
                         row.names=rownames(genomes$snp.coords))
system.time(
    fit.gemma <- gemma(model="ulmm", y=phenos$Y[,1], X=X,
                       maf=0.01, recode.genos=FALSE,
                       snp.coords=snp.coords,
                       alleles=genomes$alleles,
                       W=phenos$W, weights=NULL, clean="all"))
stopifnot(all(names(phenos$a) == rownames(fit.gemma$tests)))
system.time(
    fit.gemma.chr <- gemmaUlmmPerChr(y=phenos$Y[,1], X=X,
                                     maf=0.01, recode.genos=FALSE,
                                     snp.coords=snp.coords,
                                     alleles=genomes$alleles,
                                     W=phenos$W, clean="all"))
stopifnot(all(names(phenos$a) == rownames(fit.gemma.chr)))

Look at the Manhattan plot:

tmp <- data.frame(BP=fit.gemma.chr$ps,
                  CHR=fit.gemma.chr$chr,
                  P=fit.gemma.chr$p_wald,
                  SNP=rownames(fit.gemma.chr))
manhattan(x=tmp,
          chrlabs=unique(cn2i$original[order(cn2i$renamed)]),
          suggestiveline=FALSE,
          genomewideline=-log10(0.05 / nrow(fit.gemma.chr)),
          highlight=true.qtls, main="GEMMA per chr")
legend("topright", legend="Bonferroni threshold", col="red", lty=1, bty="n")

Results concerning the QTL percentage

plotHistPval(pvalues=fit.gemma$tests$p_wald,
             main="GEMMA (all chromosomes)")
plotHistPval(pvalues=fit.gemma.chr$p_wald,
             main="GEMMA (per chromosome)")
cols <- setNames(rep("black", ncol(X)), colnames(X))
cols[true.qtls] <- "red"
pvadj <- qqplotPval(pvalues=setNames(fit.gemma$tests$p_wald,
                                     rownames(fit.gemma$tests)),
                    thresh=0.05,
                    ctl.fwer.bonf=TRUE, ctl.fdr.bh=TRUE, ctl.fdr.storey=TRUE,
                    plot.signif=TRUE, col=cols,
                    main="GEMMA (all chromosomes)")
1 - qvalue(fit.gemma$tests$p_wald, fdr.level=0.05)$pi0
pvadj.chr <- qqplotPval(pvalues=setNames(fit.gemma.chr$p_wald,
                                         rownames(fit.gemma.chr)),
                        thresh=0.05,
                        ctl.fwer.bonf=TRUE, ctl.fdr.bh=TRUE, ctl.fdr.storey=TRUE,
                        plot.signif=TRUE, col=cols,
                        main="GEMMA (per chromosome)")
1 - qvalue(fit.gemma.chr$p_wald, fdr.level=0.05)$pi0

Try ashr:

x <- fit.gemma.chr$beta
h <- hist(x, breaks="FD")
seq.x <- seq(min(x), max(x), length=60) 
dnorm.y <- dnorm(seq.x, mean=mean(x), sd=sd(x)) 
dnorm.y <- dnorm.y * diff(h$mids[1:2]) * length(x)
lines(seq.x, dnorm.y, col="red", lwd=2)
## see ?MASS::fitdistr (example) to fit a "t" dist
fit.ash <- ash(betahat=fit.gemma.chr$beta,
               sebetahat=fit.gemma.chr$se,
               df=NULL) # alpha=0 by default
head(fit.ash$result)
plot(fit.gemma.chr$beta,
     fit.ash$result$PosteriorMean,
     col=cols, asp=1)
abline(h=0, v=0, a=0, b=1, lty=2)
plot(phenos$a,
     fit.ash$result$PosteriorMean,
     col=cols, asp=1)
abline(h=0, v=0, a=0, b=1, lty=2)

Results concerning non-null effects

tmp <- setNames(rep(TRUE, ncol(X)), names(phenos$gamma))
tmp[rownames(pvadj[pvadj$pv.bh > 0.05,])] <- FALSE
t(binaryClassif(known.nulls=phenos$gamma == 0,
                called.nulls=tmp))
tmp <- setNames(rep(TRUE, ncol(X)), names(phenos$gamma))
tmp[rownames(pvadj.chr[pvadj.chr$pv.bh > 0.05,])] <- FALSE
t(binaryClassif(known.nulls=phenos$gamma == 0,
                called.nulls=tmp))

Results concerning effect magnitudes

cor(phenos$a[true.qtls],
    fit.gemma$tests[true.qtls, "beta"])
cor(phenos$a[true.qtls],
    fit.gemma.chr[true.qtls, "beta"])
plot(x=phenos$a[true.qtls],
     y=fit.gemma$tests[true.qtls, "beta"], asp=1,
     xlab="true", ylab="estimated", main="SNP effects (GEMMA all chrs)")
abline(h=0, v=0, a=0, b=1, lty=2)
plot(x=phenos$a[true.qtls],
     y=fit.gemma.chr[true.qtls, "beta"], asp=1,
     xlab="true", ylab="estimated", main="SNP effects (GEMMA per chr)")
abline(h=0, v=0, a=0, b=1, lty=2)

Perform inference SNP-by-SNP with BLMM

Ref: Wen (2015)

Fit

It uses the approximate Bayes factor from Wakefield, which requires summary statistics, e.g. from GEMMA:

system.time(
    fit.blmm <-
      calcAsymptoticBayesFactorWakefield(
          theta.hat=setNames(fit.gemma.chr$beta, rownames(fit.gemma.chr)),
          V=setNames(fit.gemma.chr$se, rownames(fit.gemma.chr)),
          W=c(0.1, 0.2, 0.4, 0.8, 1.6),
          log10=TRUE))
stopifnot(names(fit.blmm) == colnames(X))

Results concerning the QTL percentage

(pi0.hat <- estimatePi0WithEbf(log10.bfs=fit.blmm))

Results concerning non-null effects

signif <- controlBayesFdr(log10.bfs=fit.blmm, pi0=pi0.hat)
stopifnot(names(signif) == colnames(X))
t(binaryClassif(known.nulls=phenos$gamma == 0,
                called.nulls=! signif))

Results concerning effect magnitudes

TODO: compute posterior means

Perform inference all SNPs jointly with MLMM

Ref: Segura et al (2012)

Fit

system.time(
    fit.mlmm <- mlmm(Y=phenos$Y[,1], X=phenos$X.A, K=A.cs,
                     nbchunks=2, maxsteps=20))
plot_step_RSS(fit.mlmm)
snp.info <- cbind(colnames(X), genomes$snp.coords[colnames(X),])
colnames(snp.info) <- c("SNP", "Chr", "Pos")
c2i <- chromNames2integers(snp.info$Chr)
snp.info$Chr <- c2i$renamed
plot_opt_GWAS(fit.mlmm, opt="extBIC", snp_info=snp.info, pval_filt=0.1,
              main="optimal (EBIC)")

Look at the Manhattan plot:

tmp <- data.frame(P=fit.mlmm$pval_step[[1]]$out$pval,
                  SNP=rownames(fit.mlmm$pval_step[[1]]$out),
                  stringsAsFactors=FALSE)
tmp$CHR <- snp.coords[tmp$SNP, "chr"]
tmp$BP <- snp.coords[tmp$SNP, "coord"]
manhattan(x=tmp,
          chrlabs=unique(cn2i$original[order(cn2i$renamed)]),
          suggestiveline=FALSE,
          genomewideline=-log10(0.05 / nrow(fit.mlmm$pval_step[[1]]$out)),
          highlight=true.qtls, main="MLMM")
legend("topright", legend="Bonferroni threshold", col="red", lty=1, bty="n")

Results concerning non-null effects

tmp <- setNames(rep(TRUE, ncol(X)), names(phenos$gamma))
tmp[fit.mlmm$opt_extBIC$cof] <- FALSE
t(binaryClassif(known.nulls=phenos$gamma == 0,
                called.nulls=tmp))

Results concerning effect magnitudes

tmp <- setNames(rep(0, length(true.qtls)), true.qtls)
tmp[fit.mlmm$opt_extBIC$cof] <- fit.mlmm$opt_extBIC$coef[-1, "Estimate"]
cor(phenos$a[true.qtls], tmp)
plot(x=phenos$a[true.qtls],
     y=tmp, asp=1,
     xlab="true", ylab="estimated", main="SNP effects (MLMM)")
abline(h=0, v=0, a=0, b=1, lty=2)

Perform inference all SNPs jointly with MLMM-GWAS

Ref: Bonnafous et al (2017)

Fit

Xa <- scale(phenos$X.A, center=TRUE, scale=FALSE)
K.add <- Xa %*% t(Xa)
system.time(
    fit.mlmmgwas <- mlmm_allmodels(Y=phenos$Y[,1],
                                   XX=list(Xa), KK=list(K.add),
                                   nbchunks=2, maxsteps=20))
manhattan.plot(fit.mlmmgwas)
sel.XX <- frommlmm_toebic(list(Xa), fit.mlmmgwas)
system.time(
    res.eBIC <- eBIC_allmodels(phenos$Y[,1], sel.XX, list(K.add), ncol(Xa)))
res.eBIC
sel.XXclass <- fromeBICtoEstimation(sel.XX, res.eBIC)
effects <- Estimation_allmodels(phenos$Y[,1], sel.XXclass, list(K.add))
effects
genotypes.boxplot(Xa, phenos$Y[,1], rownames(res.eBIC)[2], effects,
                  c("0","1","2"), xlab="genotypic classes",
                  las=1, ylab="phenotypic values")

Perform inference all SNPs jointly with varbvs

Ref: Carbonetto and Stephens (2012)

Fit

system.time(
    fit.varbvs <- varbvs(X=phenos$X.A, Z=NULL, y=phenos$Y[,1],
                         weights=NULL, verbose=FALSE))
print(fit.varbvs.s <- summary(fit.varbvs))
subset.snps <- unique(c(as.character(fit.varbvs.s$top.vars$variable),
                        true.qtls))
subset.coords <- genomes$snp.coords[subset.snps,]
(subset.coords <- subset.coords[order(subset.coords$chr, rownames(subset.coords)),])
ld <- estimLd(X=X[, rownames(subset.coords)], snp.coords=subset.coords)
ld[ld$loc1 == as.character(fit.varbvs.s$top.vars$variable)[1],]

Results concerning the QTL percentage

(pi.hat <- 10^(fit.varbvs.s$logodds$x0) / (1 + 10^(fit.varbvs.s$logodds$x0)))
(pi.hat.low <- 10^(fit.varbvs.s$logodds$a) / (1 + 10^(fit.varbvs.s$logodds$a)))
(pi.hat.high <- 10^(fit.varbvs.s$logodds$b) / (1 + 10^(fit.varbvs.s$logodds$b)))

Results concerning non-null effects

w <- c(normalizelogweights(fit.varbvs$logw))
pips <- c(fit.varbvs$alpha %*% w)
cols <- rep("black", ncol(phenos$X.A))
cols[phenos$gamma != 0] <- "red"
plot(x=1:ncol(phenos$X.A), y=pips, col=cols, las=1, xlab="SNPs", ylab="PIP",
     main="Posterior inclusion probabilities (varbvs)")

Results concerning effect magnitudes

TODO

Perform inference all SNPs jointly with BGLR

Ref: de los Campo et al (2009)

Fit

Run BGLR:

task.id <- "test-BGLR_"
nb.iters <- 50 * 10^3
burn.in <- 5 * 10^3
thin <- 5
system.time(
    fit.bglr <- BGLR(y=phenos$Y[,1],
                     response_type="gaussian",
                     ETA=list(list(X=X, model="BayesC", saveEffects=TRUE)),
                     weights=NULL,
                     groups=NULL,
                     nIter=nb.iters, burnIn=burn.in, thin=thin,
                     saveAt=task.id,
                     verbose=FALSE))

Assess convergence (caution, BGLR is inconsistent in its use of burnIn when saving samples):

post.samples <- cbind(mu=read.table(paste0(task.id, "mu.dat"))[,1],
                      varE=read.table(paste0(task.id, "varE.dat"))[,1],
                      pi=read.table(paste0(task.id, "ETA_1_parBayesC.dat"))[,1],
                      varB=read.table(paste0(task.id, "ETA_1_parBayesC.dat"))[,2])
post.samples <- mcmc.list(mcmc(post.samples, start=thin,
                               end=nb.iters, thin=thin))
post.samples <- window(post.samples, start=burn.in+1)
plot(post.samples)
raftery.diag(post.samples, q=0.5, r=0.05, s=0.9)
geweke.diag(post.samples) # should not be too far from [-2;2]
effectiveSize(post.samples)
summary(post.samples)
post.effects <- readBinMat(paste0(task.id, "ETA_1_b.bin"))
dim(post.effects)
colnames(post.effects) <- colnames(X)
post.effects <- mcmc.list(mcmc(post.effects, start=thin+burn.in,
                               end=nb.iters, thin=thin))
plot(post.effects[,true.qtls[1:4]])
summary(post.effects[,true.qtls])

Look at the proportion of variance explained by breeding values:

pve <- rep(NA, (nb.iters - burn.in) / thin)
for(i in seq_along(pve)){
  var.gen.add <- var(X %*% post.effects[[1]][i,])
  pve[i] <- var.gen.add / (var.gen.add + post.samples[[1]][i,"varE"])
}
pve <- mcmc.list(mcmc(pve, start=thin+burn.in,
                      end=nb.iters, thin=thin))
summary(pve)
plot(pve)

Results concerning the QTL percentage

summary(post.samples[,"pi"])
HPDinterval(post.samples[,"pi"])

Results concerning non-null effects

post.snps <- fit.bglr$ETA[[1]]$d
names(post.snps) <- colnames(X)
summary(post.snps)
cols <- rep("black", ncol(X))
cols[phenos$gamma != 0] <- "red"
plot(x=1:ncol(X), y=post.snps[names(phenos$gamma)],
     col=cols, las=1, xlab="SNPs", ylab="PIP",
     main="Posterior inclusion probabilities (BGLR with BayesC)")

Results concerning effect magnitudes

idx <- match(true.qtls, fit.bglr$ETA[[1]]$colNames)
tmp <- cbind(true=phenos$a[true.qtls],
             pip=fit.bglr$ETA[[1]]$d[idx],
             estim=fit.bglr$ETA[[1]]$b[idx],
             se=fit.bglr$ETA[[1]]$SD.b[idx])
tmp[order(abs(tmp[,1]), decreasing=TRUE),]
post.snps <- fit.bglr$ETA[[1]]$b
names(post.snps) <- colnames(X)
cor(phenos$a[true.qtls], post.snps[true.qtls])
plot(x=phenos$a[true.qtls],
     y=post.snps[true.qtls], asp=1,
     xlab="true", ylab="estimated", main="SNP effects (BGLR with BayesC)")
abline(h=0, v=0, a=0, b=1, lty=2)

Clean

for(f in c(paste0(task.id, c("mu.dat", "varE.dat", "ETA_1_parBayesC.dat",
                             "ETA_1_b.bin"))))
  if(file.exists(f))
    file.remove(f)

Perform inference all SNPs jointly with GS3

Ref: Legarra et al (2014)

Fit

Set up the configuration:

task.id <- "test"
dat <- data.frame(geno.id=rownames(phenos$Y),
                  overall.mean=1,
                  pheno=phenos$Y[,1])#,
                  ## weight=NA)
ptl <- data.frame(position=c(2,
                             ncol(dat) + 1),
                             ## ncol(dat) + 1),
                  type=c("cross",
                         "add_SNP"),
                         ## "dom_SNP"),
                  nlevels=c(1,
                            0))
                            ## 0))
config <- getDefaultConfig(
    nb.snps=ncol(X),
    rec.id=which(colnames(dat) == "geno.id"),
    twc=c(which(colnames(dat) == "pheno"),
          0),#which(colnames(dat) == "weight")),
    method="VCE",
    ptl=ptl,
    use.mix="T",
    task.id=task.id)
config$niter <- 30 * 10^3
config$burnin <- 15 * 10^3
config$thin <- 2
config$ap
getMeanVarBetaDist(1, 10)
stopifnot(isValidConfig(config))

Prepare the input files:

inds <- setNames(object=1:nlevels(dat$geno.id),
                 nm=levels(dat$geno.id))
data.GS3.file <- paste0(task.id, "_data.tsv")
config$data.file <- data.GS3.file
writeDataForGs3(x=dat, file=data.GS3.file, inds=inds,
                col.id=which(colnames(dat) == "geno.id"),
                col.traits=which(colnames(dat) == "pheno"))
genos.GS3.file <- paste0(task.id, "_genos.tsv")
config$genos.file <- genos.GS3.file
writeGenosForGs3(x=X, file=genos.GS3.file, inds=inds)
config.file <- writeConfigForGs3(config=config,
                                 task.id=task.id)

Run GS3:

system.time(
    stdouterr.GS3.file <- execGs3(config.file, task.id))

Assess convergence:

vcs <- vcs2mcmc(config, afs)
plot(vcs[, grep("^vara$|vare|pa_1", colnames(vcs[[1]]))])
raftery.diag(vcs, q=0.5, r=0.05, s=0.9)
geweke.diag(vcs) # should not be too far from [-2;2]
effectiveSize(vcs)
summary(vcs)

Look at the proportion of variance explained by breeding values:

plot(vcs[, grep("h2", colnames(vcs[[1]]))], main="h2")

Results concerning the QTL percentage

summary(vcs[,"pa_1"])
HPDinterval(vcs[,"pa_1"])

Results concerning non-null effects

sols <- read.table(file=config$sol.file, header=TRUE)
table(sols$effect)
sols$solution[sols$effect == 1]
post.snps <- sols[sols$effect == 2,]
rownames(post.snps) <- colnames(X)
summary(post.snps$p)
cols <- rep("black", ncol(X))
cols[phenos$gamma != 0] <- "red"
plot(x=1:ncol(X), y=post.snps[names(phenos$gamma), "p"],
     col=cols, las=1, xlab="SNPs", ylab="PIP",
     main="Posterior inclusion probabilities (GS3 with BayesCPi)")

Results concerning effect magnitudes

summary(post.snps$solution)
tmp <- cbind(phenos$a[true.qtls],
             post.snps[true.qtls, c("solution", "sderror", "p")])
tmp[order(abs(tmp[,1]), decreasing=TRUE),]
cor(phenos$a[true.qtls],
    post.snps[true.qtls, "solution"])
plot(x=phenos$a[true.qtls],
     y=post.snps[true.qtls, "solution"], asp=1,
     xlab="true", ylab="estimated", main="SNP effects (GS3 with BayesCPi)")
abline(h=0, v=0, a=0, b=1, lty=2)

Clean

cleanGs3(config, config.file, task.id)
file.remove(data.GS3.file)
file.remove(genos.GS3.file)

Perform inference all SNPs jointly with glmnet

Ref: Friedman et al (2010)

TODO

Appendix

t1 <- proc.time(); t1 - t0
print(sessionInfo(), locale=FALSE)


timflutre/rutilstimflutre documentation built on Feb. 7, 2024, 8:17 a.m.