R.v.maj <- as.numeric(R.version$major)
R.v.min.1 <- as.numeric(strsplit(R.version$minor, "\\.")[[1]][1])
if(R.v.maj < 2 || (R.v.maj == 2 && R.v.min.1 < 15))
  stop("requires R >= 2.15", call.=FALSE)

suppressPackageStartupMessages(library(knitr))
opts_chunk$set(echo=TRUE, warning=TRUE, message=TRUE, cache=FALSE, fig.align="center")

Overview

This document requires external packages to be available:

suppressPackageStartupMessages(library(scrm))
suppressPackageStartupMessages(library(VariantAnnotation))
suppressPackageStartupMessages(library(rutilstimflutre))
stopifnot(compareVersion("0.161.0",
                         as.character(packageVersion("rutilstimflutre")))
          != 1)
stopifnot(file.exists(Sys.which("beagle.jar")))
suppressPackageStartupMessages(library(parallel))
nb.cores <- detectCores() - 1

This R chunk is used to assess how much time it takes to execute the R code in this document until the end:

t0 <- proc.time()

Simulate genetic data at HWE

Set the seed:

set.seed(1859)

Simulate haplotypes and genotypes in a single population:

nb.genos <- 1*10^3
nb.chrs <- 5
Ne <- 10^4
chrom.len <- 1*10^6
mu <- 10^(-8)
c.rec <- 10^(-8)
genomes <- simulCoalescent(nb.inds=nb.genos,
                           nb.reps=nb.chrs,
                           pop.mut.rate=4 * Ne * mu * chrom.len,
                           pop.recomb.rate=4 * Ne * c.rec * chrom.len,
                           chrom.len=chrom.len,
                           get.alleles=TRUE)
afs.pop <- estimSnpAf(X=genomes$genos)
plotHistAllelFreq(afs=afs.pop,
                  main="Allele frequencies at HWE")
mafs.pop <- estimSnpMaf(afs=afs.pop)
plotHistMinAllelFreq(mafs=mafs.pop,
                     main="Minor allele frequencies at HWE")
A.vr.pop <- estimGenRel(X=genomes$genos, afs=afs.pop, method="vanraden1")
imageWithScale(A.vr.pop,
               main="Additive genetic relationships at HWE")
summary(diag(A.vr.pop))
hist(diag(A.vr.pop), col="grey", border="white")
summary(A.vr.pop[upper.tri(A.vr.pop)])
hist(A.vr.pop[upper.tri(A.vr.pop)], col="grey", border="white")

Mimick RAD-seq

Discard SNP genotypes according to a "RAD-seq" design, i.e. all genotypes and SNPs have NA, but with a skewed distribution (30% of the sites with 0-0.05% NA, 17% of the sites with 0.05-0.1% NA, 8% with 0.1-0.15% NA, and the remaining sites with a uniformly-increasing amount of NA):

genos.tokeep <- sample(rownames(genomes$genos), size=300)
dim(X <- genomes$genos[genos.tokeep,])
X.na <- X
bin.width <- 0.05
max.prop <- 0.9
bin.mids <- seq(from=bin.width/2, to=max.prop, by=bin.width)
bin.heights <- c(0.3, 0.17, 0.08,
                 rep((1-(0.3+0.17+0.08))/(length(bin.mids) - 3),
                     length(bin.mids) - 3))
stopifnot(sum(bin.heights) == 1)
assigned.bins <- cut(x=1:ncol(X),
                     breaks=floor(ncol(X) * cumsum(c(0, bin.heights))),
                     labels=FALSE)
assigned.bins <- sample(assigned.bins) # shuffle the bins among SNPs
table(assigned.bins)
bin.probs <- seq(from=0, to=max.prop, length.out=length(bin.mids))
## for each SNP, sample genotype indices to set as missing
idx1 <- sapply(1:ncol(X), function(j){
  sample.int(n=nrow(X), size=round(bin.probs[assigned.bins[j]] * nrow(X)))
})
idx2 <- do.call(c, lapply(1:length(idx1), function(j){
  ((j-1) * nrow(X)) + idx1[[j]]
}))
X.na[idx2] <- NA

Plot grid of missing SNP genotypes:

plotGridMissGenos(X=X.na)

Plot histogram of NA proportions, similar to figure S4 of Chan et al (2016)):

miss.snps <- calcFreqMissSnpGenosPerSnp(X=X.na)
summary(miss.snps)
hist(miss.snps, breaks=seq(0,1,0.05), col="grey", border="white",
     main="Distribution of the proportion of missing data per biallelic SNP",
     xlab="proportion of missing data at a given SNP",
     ylab="number of SNPs")

Stats on the mask of NA's:

mask.na <- is.na(X.na)
sum(mask.na) # number of NA's
sum(mask.na) / length(mask.na) # percentage of NA's
perc.NAs.snp <- apply(mask.na, 2, function(mask.snp){
  100 * sum(mask.snp) / length(mask.snp)
})
smoothScatter(x=mafs.pop, y=perc.NAs.snp, ylim=c(0,100),
              xlab="minor allele frequencies",
              ylab="%NA", las=1,
              main="Percentage of NA's as a function of MAF")
bin.snps.maf <- cut(mafs.pop, breaks=seq(0, 0.5, 0.02))
names(bin.snps.maf) <- names(mafs.pop)
table(bin.snps.maf)
tmp <- quantilesBinnedSnpData(perc.NAs.snp, bin.snps.maf)
points(tmp$bin.mids, tmp$quant.bin.snp.dat[,"25%"],
       pch=4, col="black")
points(tmp$bin.mids, tmp$quant.bin.snp.dat[,"75%"],
       pch=4, col="black")
segments(x0=tmp$bin.mids, y0=tmp$quant.bin.snp.dat[,"25%"],
         x1=tmp$bin.mids, y1=tmp$quant.bin.snp.dat[,"75%"], lty=2)
points(tmp$bin.mids, tmp$quant.bin.snp.dat[,"50%"],
       pch=19, col="red")
legend("topright", legend=c("median", "first/third quartile"),
       col=c("red","black"), pch=c(19,4), bty="n")

Perform imputation

vcf.na <- genoDoses2Vcf(X.na, genomes$snp.coords, genomes$alleles)
file.prefix <- "snp-genos"
p2f.vcf.na <- paste0(file.prefix, ".vcf")
writeVcf(vcf.na, p2f.vcf.na, index=FALSE)
prefix.vcf.imp <- paste0(file.prefix, "_beagle")
cmd <- paste0("java -Xms", "1G", " -Xmx", "3G",
              " -jar ", Sys.which("beagle.jar"),
              " gtgl=", p2f.vcf.na,
              " out=", prefix.vcf.imp,
              " nthreads=", nb.cores,
              " window=1000 overlap=100 gprobs=true ne=10000")
if(! file.exists(paste0(prefix.vcf.imp, ".vcf.gz")))
  system.time(ret <- system(cmd)) # ~ 15 min with 3 cores
gdose.file <- paste0(file.prefix, "_beagle_gdose.tsv.gz")
ca.file <- paste0(file.prefix, "_beagle_coords-alleles.tsv.gz")
## the following lines are commented because of a bug in Rsamtools::indexTabix
## vcf2dosage(vcf.file=paste0(prefix.vcf.imp, ".vcf.gz"),
##            gdose.file=gdose.file, ca.file=ca.file, yieldSize=1000,
##            uncertain=TRUE)
## X.imp <- as.matrix(t(read.table(gdose.file, header=TRUE, sep="\t")))
f <- paste0(prefix.vcf.imp, ".vcf.gz.tbi")
if(file.exists(f))
  file.remove(f)
vcf.imp <- readVcf(paste0(prefix.vcf.imp, ".vcf.gz"))
## X.imp <- t(gtVcf2dose(vcf=vcf.imp))
X.imp <- t(dsVcf2dose(vcf=vcf.imp))
dim(X.imp)
stopifnot(all(dim(X.imp) == dim(X)))

Assess accuracy

length(X)
sum(X.na == X, na.rm=TRUE)
sum(is.na(X.na))
sum(is.na(X.na)) / length(X)
sum(X.imp == X)
sum(X.imp != X)
sum(X.imp != X) / sum(is.na(X.na))

Checks:

afs.imp <- estimSnpAf(X.imp)
summary(afs.imp)
A.vr.imp <- estimGenRel(X=X.imp, afs=afs.pop)
summary(diag(A.vr.imp))
summary(A.vr.imp[upper.tri(A.vr.imp)])

Plot of allelic $R^2$ as a function of the amount of missing SNP genotypes:

allelic.r2 <- sapply(colnames(X.imp), function(snp){
  suppressWarnings(cor(X[,snp], X.imp[,snp]))
})
stopifnot(all(names(miss.snps) == names(allelic.r2)))
smoothScatter(x=100 * miss.snps, y=allelic.r2, xlim=c(0,100),
              xlab="percentage of missing genotypes",
              ylab="allelic R2", las=1,
              main="Imputation accuracy (Beagle 4.1)")
bin.snps.perc.na <- cut(miss.snps, breaks=seq(0, 1, 0.05))
names(bin.snps.perc.na) <- names(miss.snps)
table(bin.snps.perc.na)
tmp <- quantilesBinnedSnpData(allelic.r2, bin.snps.perc.na)
points(100 * tmp$bin.mids, tmp$quant.bin.snp.dat[,"25%"],
       pch=4, col="black")
points(100 * tmp$bin.mids, tmp$quant.bin.snp.dat[,"75%"],
       pch=4, col="black")
segments(x0=100 * tmp$bin.mids, y0=tmp$quant.bin.snp.dat[,"25%"],
         x1=100 * tmp$bin.mids, y1=tmp$quant.bin.snp.dat[,"75%"], lty=2)
points(100 * tmp$bin.mids, tmp$quant.bin.snp.dat[,"50%"],
       pch=19, col="red")
legend("bottomleft", legend=c("median", "first/third quartile"),
       col=c("red","black"), pch=c(19,4), bty="n")

Plot of allelic $R^2$ as a function of the minor allele frequencies, similar to figure 3 of Browning and Browning (2009):

smoothScatter(mafs.pop, allelic.r2,
              xlab="minor allele frequencies",
              ylab="allelic R2", las=1,
              main="Imputation accuracy (Beagle 4.1)")
tmp <- quantilesBinnedSnpData(allelic.r2, bin.snps.maf)
points(tmp$bin.mids, tmp$quant.bin.snp.dat[,"25%"], pch=4, col="black")
points(tmp$bin.mids, tmp$quant.bin.snp.dat[,"75%"], pch=4, col="black")
segments(x0=tmp$bin.mids, y0=tmp$quant.bin.snp.dat[,"25%"],
         x1=tmp$bin.mids, y1=tmp$quant.bin.snp.dat[,"75%"], lty=2)
points(tmp$bin.mids, tmp$quant.bin.snp.dat[,"50%"], pch=19, col="red")
legend("bottomright", legend=c("median", "first/third quartile"),
       col=c("red","black"), pch=c(19,4), bty="n")

Clean

file.remove(p2f.vcf.na)
for(suffix in c("log", "vcf.gz"))
  file.remove(paste0(prefix.vcf.imp, ".", suffix))
for(f in c(gdose.file, ca.file))
  if(file.exists(f))
    file.remove(f)

Appendix

t1 <- proc.time(); t1 - t0
print(sessionInfo(), locale=FALSE)


timflutre/rutilstimflutre documentation built on Feb. 7, 2024, 8:17 a.m.