knitr::opts_chunk$set(echo = TRUE, fig.align = "center")
library(bigsnpr) library(ggplot2)
Let us use a subsetted version of the 1000 Genomes project data we provide. Some quality control has already been done; otherwise, you can use snp_plinkQC()
.
bedfile <- download_1000G("tmp-data")
First, let us detect all pairs of related individuals.
plink2 <- download_plink2("tmp-data") rel <- snp_plinkKINGQC( plink2.path = plink2, bedfile.in = bedfile, thr.king = 2^-4.5, make.bed = FALSE, ncores = nb_cores() ) str(rel)
We then compute PCA without using the related individuals. Function bed_autoSVD()
should take care of Linkage Disequilibrium (LD). We have to compute the PCA while removing all variants from one chromosome. We can still precompute the pruning step to do it only once.
(obj.bed <- bed(bedfile)) fam2 <- bigreadr::fread2(sub_bed(bedfile, ".fam2")) ind.row <- which(!obj.bed$fam$sample.ID %in% rel$IID2 & fam2$`Super Population` == "EUR") ind.keep <- bed_clumping(obj.bed, ind.row = ind.row, ncores = nb_cores())
CHR <- obj.bed$map$chromosome POS <- obj.bed$map$physical.pos library(bigparallelr) registerDoParallel(cl <- makeCluster(nb_cores())) all_stats <- foreach(chr = 1:22, .combine = "rbind") %dopar% { ## LD scores ind.chr <- which(CHR == chr) library(bigsnpr) ## Should implement bed_cor() directly G <- snp_attach(snp_readBed2(bedfile, tempfile(), ind.row, ind.chr))$genotypes LD <- snp_cor(G, infos.pos = POS[ind.chr]) # more lenient alpha? library(Matrix) LD_scores <- colSums(LD ** 2) ## PCA obj.svd <- bed_autoSVD( obj.bed, ind.row = ind.row, k = 10, thr.r2 = NA, ind.col = ind.keep[CHR[ind.keep] != chr]) ## Z-scores (association with PC scores) ~ loadings Z <- bigsnpr:::multLinReg(obj.bed, ind.row, ind.chr, U = obj.svd$u) cbind(LD_scores, Z) } stopCluster(cl) saveRDS(all_stats, "tmp-data/ldscores-loadings.rds")
LD_scores <- all_stats[, 1] Z2 <- all_stats[, -1] ** 2 cor(Z2, LD_scores, use = "pairwise.complete.obs") # 0.066223166 # 0.058258742 # -0.002232321 # 0.024427324 # -0.029319222 # -0.007915754 # -0.012346952 # -0.018130519 # -0.010315609 # 0.001048635 bin_plot <- function(x, N = 10) { grp <- cut(x, quantile(x, seq_len(N) / N, na.rm = TRUE)) plot(by(LD_scores, grp, median), by(x, grp, mean)) } bin_plot(Z2[, 1]) bin_plot(Z2[, 2]) bin_plot(Z2[, 3]) bin_plot(Z2[, 4]) bin_plot(Z2[, 5]) bin_plot(Z2[, 6]) bin_plot(Z2[, 7]) bin_plot(Z2[, 8])
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.