opts_chunk$set(comment = NA, results = 'markup', tidy = F, message = F, warning = F, echo = T,
  fig.width = 12, fig.height = 6)

Preliminaries

library(utils) # for downloads
library(BEDMatrix) # to read plink data efficiently

library(RSpectra) # to perform PCA with a few components, e.g. 2

library(magrittr) 
library(dplyr) 

#library(bigcov)
library(devtools)
#load_all("~/git/variani/bigcov/")
load_all()

Function to plot PCA

plot_pop <- function(mod, labs, ...)
{
  plot(mod$vectors[, 1], mod$vectors[, 2], type = "n", 
    xlab = "PC1", ylab = "PC2", ...)
  text(mod$vectors[, 1], mod$vectors[, 2], label = labs, col = as.numeric(labs))
}

Data paths

We define a path to 1K dataset based on the output of Sys.info()[["username"]].

nodename <- Sys.info()[["user"]]

dpath <- switch(nodename,
  "your_username" = "~/Data/1KGenome/",
  "redpr" = "/udd/redpr/mixed_model/1kG/",
  stop("unknown username"))

GRM on a subset of common variants

f <- file.path(dpath, "phase3_common_EUR503_pruned_100ksubset_bed.bed")
bmat <- BEDMatrix(f)

bmat
dim(bmat)

ids <- rownames(bmat)
ids <- strsplit(ids, "_") %>% sapply(function(x) x[1])
head(ids)

pop <- read.table(file.path(dpath, "pop_label_phase3_2504_without_rels"), 
    header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(pop = factor(pop)) %>% 
  left_join(data_frame(sample = ids), ., by = "sample")

system.time({
  grm <- bigdat_grm(bmat, batch_size = 1e4, verbose = 1)
})

mod <- eigs(grm, k = 2)
mod2<-prcomp(grm)
names(mod2)[2]<-'vectors'

labs <- pop$pop
par(mfrow=c(1, 2))
plot_pop(mod, labs, main = "GRM on common markers, using eigs")
plot_pop(mod2,labs,main='GRM on common markers, using base::prcomp')

GRM on a subset of rare variants

f <- file.path(dpath, "phase3_rare_EUR503_nosingle_100ksubset_bed.bed")
bmat <- BEDMatrix(f)

bmat
dim(bmat)

ids <- rownames(bmat)
ids <- strsplit(ids, "_") %>% sapply(function(x) x[1])
head(ids)

pop <- read.table(file.path(dpath, "pop_label_phase3_2504_without_rels"), 
    header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(pop = factor(pop)) %>% 
  left_join(data_frame(sample = ids), ., by = "sample")

system.time({
  grm <- bigdat_grm(bmat, batch_size = 1e4, verbose = 1)
})

mod <- eigs(grm, k = 2)
mod2<-prcomp(grm)
names(mod2)[2]<-'vectors'

labs <- pop$pop
par(mfrow=c(1, 2))
plot_pop(mod, labs, main = "GRM on rare markers, using eigs")
plot_pop(mod2,labs,main='GRM on rare markers, using base::prcomp')

Jacard on a subset of rare variants

f <- file.path(dpath, "phase3_rare_EUR503_nosingle_100ksubset_bed.bed")
bmat <- BEDMatrix(f)

bmat
dim(bmat)

ids <- rownames(bmat)
ids <- strsplit(ids, "_") %>% sapply(function(x) x[1])
head(ids)

pop <- read.table(file.path(dpath, "pop_label_phase3_2504_without_rels"), 
    header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(pop = factor(pop)) %>% 
  left_join(data_frame(sample = ids), ., by = "sample")

system.time({
  grm <- bigdat_grm(bmat, "Jacard", batch_size = 1e4, verbose = 1)
})

mod <- eigs(grm, k = 2)
mod2<-prcomp(grm)
names(mod2)[2]<-'vectors'

labs <- pop$pop
par(mfrow=c(1, 2))
plot_pop(mod, labs, main = "JAC on rare markers, using eigs")
plot_pop(mod2,labs,main='JAC on rare markers, using base::prcomp')

Jacard on a subset of rare variants (<0.5%)

f <- file.path(dpath, "phase3_rare_EUR503_nosingle_100ksubset_bed.bed")
bmat <- BEDMatrix(f)

bmat
dim(bmat)

ids <- rownames(bmat)
ids <- strsplit(ids, "_") %>% sapply(function(x) x[1])
head(ids)

pop <- read.table(file.path(dpath, "pop_label_phase3_2504_without_rels"), 
    header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(pop = factor(pop)) %>% 
  left_join(data_frame(sample = ids), ., by = "sample")

system.time({
  grm <- bigdat_grm(bmat, "Jacard", batch_size = 1e4, maf_max = 5e-3, verbose = 1)
})

mod <- eigs(grm, k = 2)
mod2<-prcomp(grm)
names(mod2)[2]<-'vectors'

labs <- pop$pop
par(mfrow=c(1, 2))
plot_pop(mod, labs, main = "JAC on rare markers, using eigs")
plot_pop(mod2,labs,main='JAC on rare markers, using base::prcomp')

R session info

sessionInfo()

License

This document is licensed under the Creative Commons Attribution 4.0 International Public License.

Creative Commons License

References



variani/bigcov documentation built on May 3, 2019, 4:34 p.m.