R.v.maj <- as.numeric(R.version$major) R.v.min.1 <- as.numeric(strsplit(R.version$minor, "\\.")[[1]][1]) if(R.v.maj < 2 || (R.v.maj == 2 && R.v.min.1 < 15)) stop("requires R >= 2.15", call.=FALSE) suppressPackageStartupMessages(library(knitr)) opts_chunk$set(echo=TRUE, warning=TRUE, message=TRUE, cache=FALSE, fig.align="center")
This document requires external packages to be available:
suppressPackageStartupMessages(library(scrm)) suppressPackageStartupMessages(library(VariantAnnotation)) suppressPackageStartupMessages(library(rutilstimflutre)) stopifnot(compareVersion("0.161.0", as.character(packageVersion("rutilstimflutre"))) != 1) stopifnot(file.exists(Sys.which("beagle.jar"))) suppressPackageStartupMessages(library(parallel)) nb.cores <- detectCores() - 1
This R chunk is used to assess how much time it takes to execute the R code in this document until the end:
t0 <- proc.time()
Set the seed:
set.seed(1859)
Simulate haplotypes and genotypes in a single population:
nb.genos <- 1*10^3 nb.chrs <- 5 Ne <- 10^4 chrom.len <- 1*10^6 mu <- 10^(-8) c.rec <- 10^(-8) genomes <- simulCoalescent(nb.inds=nb.genos, nb.reps=nb.chrs, pop.mut.rate=4 * Ne * mu * chrom.len, pop.recomb.rate=4 * Ne * c.rec * chrom.len, chrom.len=chrom.len, get.alleles=TRUE) afs.pop <- estimSnpAf(X=genomes$genos) plotHistAllelFreq(afs=afs.pop, main="Allele frequencies at HWE") mafs.pop <- estimSnpMaf(afs=afs.pop) plotHistMinAllelFreq(mafs=mafs.pop, main="Minor allele frequencies at HWE") A.vr.pop <- estimGenRel(X=genomes$genos, afs=afs.pop, method="vanraden1") imageWithScale(A.vr.pop, main="Additive genetic relationships at HWE") summary(diag(A.vr.pop)) hist(diag(A.vr.pop), col="grey", border="white") summary(A.vr.pop[upper.tri(A.vr.pop)]) hist(A.vr.pop[upper.tri(A.vr.pop)], col="grey", border="white")
Discard SNP genotypes according to a "RAD-seq" design, i.e. all genotypes and SNPs have NA, but with a skewed distribution (30% of the sites with 0-0.05% NA, 17% of the sites with 0.05-0.1% NA, 8% with 0.1-0.15% NA, and the remaining sites with a uniformly-increasing amount of NA):
genos.tokeep <- sample(rownames(genomes$genos), size=300) dim(X <- genomes$genos[genos.tokeep,]) X.na <- X bin.width <- 0.05 max.prop <- 0.9 bin.mids <- seq(from=bin.width/2, to=max.prop, by=bin.width) bin.heights <- c(0.3, 0.17, 0.08, rep((1-(0.3+0.17+0.08))/(length(bin.mids) - 3), length(bin.mids) - 3)) stopifnot(sum(bin.heights) == 1) assigned.bins <- cut(x=1:ncol(X), breaks=floor(ncol(X) * cumsum(c(0, bin.heights))), labels=FALSE) assigned.bins <- sample(assigned.bins) # shuffle the bins among SNPs table(assigned.bins) bin.probs <- seq(from=0, to=max.prop, length.out=length(bin.mids)) ## for each SNP, sample genotype indices to set as missing idx1 <- sapply(1:ncol(X), function(j){ sample.int(n=nrow(X), size=round(bin.probs[assigned.bins[j]] * nrow(X))) }) idx2 <- do.call(c, lapply(1:length(idx1), function(j){ ((j-1) * nrow(X)) + idx1[[j]] })) X.na[idx2] <- NA
Plot grid of missing SNP genotypes:
plotGridMissGenos(X=X.na)
Plot histogram of NA proportions, similar to figure S4 of Chan et al (2016)):
miss.snps <- calcFreqMissSnpGenosPerSnp(X=X.na) summary(miss.snps) hist(miss.snps, breaks=seq(0,1,0.05), col="grey", border="white", main="Distribution of the proportion of missing data per biallelic SNP", xlab="proportion of missing data at a given SNP", ylab="number of SNPs")
Stats on the mask of NA's:
mask.na <- is.na(X.na) sum(mask.na) # number of NA's sum(mask.na) / length(mask.na) # percentage of NA's perc.NAs.snp <- apply(mask.na, 2, function(mask.snp){ 100 * sum(mask.snp) / length(mask.snp) }) smoothScatter(x=mafs.pop, y=perc.NAs.snp, ylim=c(0,100), xlab="minor allele frequencies", ylab="%NA", las=1, main="Percentage of NA's as a function of MAF") bin.snps.maf <- cut(mafs.pop, breaks=seq(0, 0.5, 0.02)) names(bin.snps.maf) <- names(mafs.pop) table(bin.snps.maf) tmp <- quantilesBinnedSnpData(perc.NAs.snp, bin.snps.maf) points(tmp$bin.mids, tmp$quant.bin.snp.dat[,"25%"], pch=4, col="black") points(tmp$bin.mids, tmp$quant.bin.snp.dat[,"75%"], pch=4, col="black") segments(x0=tmp$bin.mids, y0=tmp$quant.bin.snp.dat[,"25%"], x1=tmp$bin.mids, y1=tmp$quant.bin.snp.dat[,"75%"], lty=2) points(tmp$bin.mids, tmp$quant.bin.snp.dat[,"50%"], pch=19, col="red") legend("topright", legend=c("median", "first/third quartile"), col=c("red","black"), pch=c(19,4), bty="n")
vcf.na <- genoDoses2Vcf(X.na, genomes$snp.coords, genomes$alleles) file.prefix <- "snp-genos" p2f.vcf.na <- paste0(file.prefix, ".vcf") writeVcf(vcf.na, p2f.vcf.na, index=FALSE) prefix.vcf.imp <- paste0(file.prefix, "_beagle") cmd <- paste0("java -Xms", "1G", " -Xmx", "3G", " -jar ", Sys.which("beagle.jar"), " gtgl=", p2f.vcf.na, " out=", prefix.vcf.imp, " nthreads=", nb.cores, " window=1000 overlap=100 gprobs=true ne=10000") if(! file.exists(paste0(prefix.vcf.imp, ".vcf.gz"))) system.time(ret <- system(cmd)) # ~ 15 min with 3 cores gdose.file <- paste0(file.prefix, "_beagle_gdose.tsv.gz") ca.file <- paste0(file.prefix, "_beagle_coords-alleles.tsv.gz") ## the following lines are commented because of a bug in Rsamtools::indexTabix ## vcf2dosage(vcf.file=paste0(prefix.vcf.imp, ".vcf.gz"), ## gdose.file=gdose.file, ca.file=ca.file, yieldSize=1000, ## uncertain=TRUE) ## X.imp <- as.matrix(t(read.table(gdose.file, header=TRUE, sep="\t"))) f <- paste0(prefix.vcf.imp, ".vcf.gz.tbi") if(file.exists(f)) file.remove(f) vcf.imp <- readVcf(paste0(prefix.vcf.imp, ".vcf.gz")) ## X.imp <- t(gtVcf2dose(vcf=vcf.imp)) X.imp <- t(dsVcf2dose(vcf=vcf.imp)) dim(X.imp) stopifnot(all(dim(X.imp) == dim(X)))
length(X) sum(X.na == X, na.rm=TRUE) sum(is.na(X.na)) sum(is.na(X.na)) / length(X) sum(X.imp == X) sum(X.imp != X) sum(X.imp != X) / sum(is.na(X.na))
Checks:
afs.imp <- estimSnpAf(X.imp) summary(afs.imp) A.vr.imp <- estimGenRel(X=X.imp, afs=afs.pop) summary(diag(A.vr.imp)) summary(A.vr.imp[upper.tri(A.vr.imp)])
Plot of allelic $R^2$ as a function of the amount of missing SNP genotypes:
allelic.r2 <- sapply(colnames(X.imp), function(snp){ suppressWarnings(cor(X[,snp], X.imp[,snp])) }) stopifnot(all(names(miss.snps) == names(allelic.r2))) smoothScatter(x=100 * miss.snps, y=allelic.r2, xlim=c(0,100), xlab="percentage of missing genotypes", ylab="allelic R2", las=1, main="Imputation accuracy (Beagle 4.1)") bin.snps.perc.na <- cut(miss.snps, breaks=seq(0, 1, 0.05)) names(bin.snps.perc.na) <- names(miss.snps) table(bin.snps.perc.na) tmp <- quantilesBinnedSnpData(allelic.r2, bin.snps.perc.na) points(100 * tmp$bin.mids, tmp$quant.bin.snp.dat[,"25%"], pch=4, col="black") points(100 * tmp$bin.mids, tmp$quant.bin.snp.dat[,"75%"], pch=4, col="black") segments(x0=100 * tmp$bin.mids, y0=tmp$quant.bin.snp.dat[,"25%"], x1=100 * tmp$bin.mids, y1=tmp$quant.bin.snp.dat[,"75%"], lty=2) points(100 * tmp$bin.mids, tmp$quant.bin.snp.dat[,"50%"], pch=19, col="red") legend("bottomleft", legend=c("median", "first/third quartile"), col=c("red","black"), pch=c(19,4), bty="n")
Plot of allelic $R^2$ as a function of the minor allele frequencies, similar to figure 3 of Browning and Browning (2009):
smoothScatter(mafs.pop, allelic.r2, xlab="minor allele frequencies", ylab="allelic R2", las=1, main="Imputation accuracy (Beagle 4.1)") tmp <- quantilesBinnedSnpData(allelic.r2, bin.snps.maf) points(tmp$bin.mids, tmp$quant.bin.snp.dat[,"25%"], pch=4, col="black") points(tmp$bin.mids, tmp$quant.bin.snp.dat[,"75%"], pch=4, col="black") segments(x0=tmp$bin.mids, y0=tmp$quant.bin.snp.dat[,"25%"], x1=tmp$bin.mids, y1=tmp$quant.bin.snp.dat[,"75%"], lty=2) points(tmp$bin.mids, tmp$quant.bin.snp.dat[,"50%"], pch=19, col="red") legend("bottomright", legend=c("median", "first/third quartile"), col=c("red","black"), pch=c(19,4), bty="n")
file.remove(p2f.vcf.na) for(suffix in c("log", "vcf.gz")) file.remove(paste0(prefix.vcf.imp, ".", suffix)) for(f in c(gdose.file, ca.file)) if(file.exists(f)) file.remove(f)
t1 <- proc.time(); t1 - t0 print(sessionInfo(), locale=FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.