opts_chunk$set(comment = NA, results = 'markup', tidy = F, message = F, warning = F, echo = T,
  fig.width = 6, fig.height = 6)

About

Population stratification is an important part of association studies conducted in a population consisting of different sub-populations. The standard approach exploited in the analysis of common genetic markers is to estimate generic relatedness matrix (GRM) [@Patterson2006]. Recently, [@Prokopenko2015] proposed to use Jacard similarity matrix in the analysis of rare variants. Here, we show practical examples on population stratification using R package bigcov in the following settings.

| Simulation | MAF range | Number of samples | Number of Markers | Relationship | |--|--|--|--|--| | Common SNPs | [0.05; 0.5] | 200 | 1,000 | GRM | | Rare SNPs | [0.001; 0.005] | 200 | 1,000 | GRM | | Rare SNPs | [0.001; 0.005] | 200 | 1,000, 2,000 & 4,000 | Jacard |

We will use the same toy example of two population as in R package jacpop reference manual (two populations with Fst = 0.1). Also, we will show an example of analysis for data stored in plink format.

Work-horse functions in bigcov

Functions bigdat_grm and bigdat_jacard take input genotype data and convert them in bigdat format (also part of bigcov), which has a few practical features:

Preliminaries

library(BEDMatrix) # to read plink data efficiently

library(RSpectra) # to perform PCA with a few components, e.g. 2

#library(bigcov)
library(devtools)
load_all("~/git/variani/bigcov/")

Function to simulate two populations

A toy example is adopted from https://cran.r-project.org/web/packages/jacpop.

sim_pop <- function(N = 200, M = 1000, Fst = 0.1, maf_max = 0.5, maf_min = 0.05, 
  seed = 1)
{
  set.seed(seed)

  maf_values <- runif(M, maf_min, maf_max)

  freq1 <- sapply(1:M, function(i) rbeta(1, 
    maf_values[i] * (1 - Fst) / Fst, 
    (1 - maf_values[i]) * (1 - Fst) / Fst))
  freq2 <- sapply(1:M, function(i) rbeta(1, 
    maf_values[i] * (1 - Fst) / Fst, 
    (1 - maf_values[i]) * (1 - Fst) / Fst))

  gdat1 <- sapply(1:M, function(i) sample(c(0, 1, 2), N, replace = TRUE,
    prob = c(((1 - freq1[i])^2), (2 * freq1[i] * (1 - freq1[i])), (freq1[i]^2))))
  gdat2 <- sapply(1:M, function(i) sample(c(0, 1, 2), N, replace = TRUE,
    prob = c(((1 - freq2[i])^2), (2 * freq2[i] * (1 - freq2[i])), (freq2[i]^2))))

  gdat <- rbind(gdat1, gdat2)

  return(gdat)
}

Function to plot PCA

plot_pop <- function(mod, labs, ...)
{
  if(missing(labs)) {
    labs <- factor(rep("Pop", nrow(mod$vectors)))
  }

  plot(mod$vectors[, 1], mod$vectors[, 2], type = "n", 
    xlab = "PC1", ylab = "PC2", ...)
  text(mod$vectors[, 1], mod$vectors[, 2], label = labs, col = as.numeric(labs))
}

Default simulation parameters

N <- 200 # the number of individuals per population
M <- 1e3 # the number of SNPs
Fst <- 0.01

Common SNPs

# simulated genotype data
gdat <- sim_pop(N = N, M = M, Fst = Fst, maf_max = 0.5, maf_min = 0.05, seed = 1)

# compute GRM
A <- bigdat_grm(gdat, check_na = FALSE, num_batches = 2, verbose = 2)

# copmute PCA on GRM
mod <- eigs(A, k = 2)

# plot
labs <- factor(c(rep("Pop 1", N), rep("Pop 2", N)))
plot_pop(mod, labs, main = "GRM on common SNPs")

Rare SNPs

Population stratification using GRM

# simulated genotype data
gdat <- sim_pop(N = N, M = M, Fst = Fst, maf_max = 0.005, maf_min = 0.001, seed = 1)

# compute GRM
A <- bigdat_grm(gdat, check_na = FALSE, num_batches = 2, verbose = 2)

# copmute PCA on GRM
mod <- eigs(A, k = 2)

# plot
labs <- factor(c(rep("Pop 1", N), rep("Pop 2", N)))
plot_pop(mod, labs, main = "GRM on rare SNPs")

Population stratification using Jacard

# simulated genotype data
# - the same data as in the previous example, as the seed value is the same (1)
gdat <- sim_pop(N = N, M = M, Fst = Fst, maf_max = 0.005, maf_min = 0.001, seed = 1)

# compute Jacard
A <- bigdat_jacard(gdat, num_batches = 2, verbose = 2)

# copmute PCA on GRM
mod <- eigs(A, k = 2)

# plot
labs <- factor(c(rep("Pop 1", N), rep("Pop 2", N)))
plot_pop(mod, labs, main = "Jacard on rare SNPs")

Population stratification using Jacard and more SNPs

# simulated genotype data
# - the simulated data is different, as we double the number of SNPs
M2 <- 2*M
gdat <- sim_pop(N = N, M = M2, Fst = Fst, maf_max = 0.005, maf_min = 0.001, seed = 1)

# compute Jacard
A <- bigdat_jacard(gdat, num_batches = 2, verbose = 2)

# copmute PCA on GRM
mod <- eigs(A, k = 2)

# plot
labs <- factor(c(rep("Pop 1", N), rep("Pop 2", N)))
plot_pop(mod, labs, main = "Jacard on rare SNPs")

Population stratification using Jacard and even more SNPs

M4 <- 4*M
gdat <- sim_pop(N = N, M = M4, Fst = Fst, maf_max = 0.005, maf_min = 0.001, seed = 1)

# compute Jacard
A <- bigdat_jacard(gdat, num_batches = 2, verbose = 2)

# copmute PCA on GRM
mod <- eigs(A, k = 2)

# plot
labs <- factor(c(rep("Pop 1", N), rep("Pop 2", N)))
plot_pop(mod, labs, main = "Jacard on rare SNPs")

Example with plink files

We use dummy data in plink format from R package BEDMatrix.

path <- system.file("extdata", "example.bed", package = "BEDMatrix")
bmat <- BEDMatrix(path)

bmat

dim(bmat)

bmat[1:5, 1:5]
N <- nrow(bmat)

grm <- bigdat_grm(bmat, num_batches = 2, verbose = 2)

mod <- eigs(grm, k = 2)

labs <- factor(rep("Dummy Pop", N))
plot_pop(mod, labs, main = "Dummy Data in plink format")

Future work

We need to test examples not covered in this document:

That will give us an idea whether R is powerful enough for big data in genetics (spoiler: yes, it is).

R session info

sessionInfo()

License

This document is licensed under the Creative Commons Attribution 4.0 International Public License.

Creative Commons License

References



variani/bigcov documentation built on May 3, 2019, 4:34 p.m.