Andrey Ziyatdinov
r Sys.Date()
library(utils) # for downloads
library(BEDMatrix) # to read plink data efficiently
library(RSpectra) # to perform PCA with a few components, e.g. 2
library(magrittr)
library(dplyr)
#library(bigcov)
library(devtools)
#load_all("~/git/variani/bigcov/")
load_all()
plot_pop <- function(mod, labs, ...)
{
plot(mod$vectors[, 1], mod$vectors[, 2], type = "n",
xlab = "PC1", ylab = "PC2", ...)
text(mod$vectors[, 1], mod$vectors[, 2], label = labs, col = as.numeric(labs))
}
We define a path to 1K dataset based on the output of Sys.info()[["nodename"]]
.
nodename <- Sys.info()[["nodename"]]
dpath <- switch(nodename,
"tau" = "~/Data/1KGenome/",
"your_nodename" = "/udd/redpr/mixed_model/1kG/",
stop("unknown nodename"))
f <- file.path(dpath, "phase3_common_EUR503_pruned_100ksubset_bed.bed")
bmat <- BEDMatrix(f)
bmat
BEDMatrix: 503 x 100000 [/home/andrey/Data/1KGenome//phase3_common_EUR503_pruned_100ksubset_bed.bed]
dim(bmat)
[1] 503 100000
ids <- rownames(bmat)
ids <- strsplit(ids, "_") %>% sapply(function(x) x[1])
head(ids)
[1] "HG00096" "HG00097" "HG00099" "HG00100" "HG00101" "HG00102"
pop <- read.table(file.path(dpath, "pop_label_phase3_2504_without_rels"),
header = TRUE, stringsAsFactors = FALSE) %>%
mutate(pop = factor(pop)) %>%
left_join(data_frame(sample = ids), ., by = "sample")
system.time({
grm <- bigdat_grm(bmat, batch_size = 1e4, verbose = 1)
})
-- filters: callrate / mono / / / check_na
- bigdat_tcrossprod: computing `tcrossprod`: 10 batches
- clean markers used in the analysis: 100000 / 100000
user system elapsed
45.256 1.804 47.180
mod <- eigs(grm, k = 2)
mod2<-prcomp(grm)
names(mod2)[2]<-'vectors'
labs <- pop$pop
par(mfrow=c(1, 2))
plot_pop(mod, labs, main = "GRM on common markers, using eigs")
plot_pop(mod2,labs,main='GRM on common markers, using base::prcomp')
f <- file.path(dpath, "phase3_rare_EUR503_nosingle_100ksubset_bed.bed")
bmat <- BEDMatrix(f)
bmat
BEDMatrix: 503 x 100000 [/home/andrey/Data/1KGenome//phase3_rare_EUR503_nosingle_100ksubset_bed.bed]
dim(bmat)
[1] 503 100000
ids <- rownames(bmat)
ids <- strsplit(ids, "_") %>% sapply(function(x) x[1])
head(ids)
[1] "HG00096" "HG00097" "HG00099" "HG00100" "HG00101" "HG00102"
pop <- read.table(file.path(dpath, "pop_label_phase3_2504_without_rels"),
header = TRUE, stringsAsFactors = FALSE) %>%
mutate(pop = factor(pop)) %>%
left_join(data_frame(sample = ids), ., by = "sample")
system.time({
grm <- bigdat_grm(bmat, batch_size = 1e4, verbose = 1)
})
-- filters: callrate / mono / / / check_na
- bigdat_tcrossprod: computing `tcrossprod`: 10 batches
- clean markers used in the analysis: 100000 / 100000
user system elapsed
44.808 1.632 46.665
mod <- eigs(grm, k = 2)
mod2<-prcomp(grm)
names(mod2)[2]<-'vectors'
labs <- pop$pop
par(mfrow=c(1, 2))
plot_pop(mod, labs, main = "GRM on rare markers, using eigs")
plot_pop(mod2,labs,main='GRM on rare markers, using base::prcomp')
f <- file.path(dpath, "phase3_rare_EUR503_nosingle_100ksubset_bed.bed")
bmat <- BEDMatrix(f)
bmat
BEDMatrix: 503 x 100000 [/home/andrey/Data/1KGenome//phase3_rare_EUR503_nosingle_100ksubset_bed.bed]
dim(bmat)
[1] 503 100000
ids <- rownames(bmat)
ids <- strsplit(ids, "_") %>% sapply(function(x) x[1])
head(ids)
[1] "HG00096" "HG00097" "HG00099" "HG00100" "HG00101" "HG00102"
pop <- read.table(file.path(dpath, "pop_label_phase3_2504_without_rels"),
header = TRUE, stringsAsFactors = FALSE) %>%
mutate(pop = factor(pop)) %>%
left_join(data_frame(sample = ids), ., by = "sample")
system.time({
grm <- bigdat_grm(bmat, "Jacard", batch_size = 1e4, verbose = 1)
})
-- filters: callrate / mono / / / check_na
- bigdat_tcrossprod: computing `tcrossprod`: 10 batches
- clean markers used in the analysis: 100000 / 100000
user system elapsed
11.616 0.668 12.492
mod <- eigs(grm, k = 2)
mod2<-prcomp(grm)
names(mod2)[2]<-'vectors'
labs <- pop$pop
par(mfrow=c(1, 2))
plot_pop(mod, labs, main = "JAC on rare markers, using eigs")
plot_pop(mod2,labs,main='JAC on rare markers, using base::prcomp')
f <- file.path(dpath, "phase3_rare_EUR503_nosingle_100ksubset_bed.bed")
bmat <- BEDMatrix(f)
bmat
BEDMatrix: 503 x 100000 [/home/andrey/Data/1KGenome//phase3_rare_EUR503_nosingle_100ksubset_bed.bed]
dim(bmat)
[1] 503 100000
ids <- rownames(bmat)
ids <- strsplit(ids, "_") %>% sapply(function(x) x[1])
head(ids)
[1] "HG00096" "HG00097" "HG00099" "HG00100" "HG00101" "HG00102"
pop <- read.table(file.path(dpath, "pop_label_phase3_2504_without_rels"),
header = TRUE, stringsAsFactors = FALSE) %>%
mutate(pop = factor(pop)) %>%
left_join(data_frame(sample = ids), ., by = "sample")
system.time({
grm <- bigdat_grm(bmat, "Jacard", batch_size = 1e4, maf_max = 5e-3, verbose = 1)
})
-- filters: callrate / mono / / maf_max / check_na
- bigdat_tcrossprod: computing `tcrossprod`: 10 batches
- clean markers used in the analysis: 75197 / 100000
user system elapsed
12.312 0.100 12.461
mod <- eigs(grm, k = 2)
mod2<-prcomp(grm)
names(mod2)[2]<-'vectors'
labs <- pop$pop
par(mfrow=c(1, 2))
plot_pop(mod, labs, main = "JAC on rare markers, using eigs")
plot_pop(mod2,labs,main='JAC on rare markers, using base::prcomp')
sessionInfo()
R version 3.3.3 (2017-03-06)
Platform: i686-pc-linux-gnu (32-bit)
Running under: Ubuntu 14.04.5 LTS
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] bigcov_0.1.1 dplyr_0.5.0 magrittr_1.5 RSpectra_0.12-0
[5] BEDMatrix_1.4.0 rmarkdown_1.3 knitr_1.15.1 devtools_1.12.0
loaded via a namespace (and not attached):
[1] codetools_0.2-15 digest_0.6.10 htmltools_0.3.5
[4] R6_2.2.0 assertthat_0.1 rprojroot_1.1
[7] grid_3.3.3 stringr_1.1.0 bigmemory.sri_0.1.3
[10] testthat_1.0.2 tibble_1.2 lattice_0.20-34
[13] DBI_0.5-1 foreach_1.4.3 roxygen2_5.0.1
[16] iterators_1.0.8 Matrix_1.2-7.1 plyr_1.8.4
[19] crochet_1.0.0 data.table_1.10.5 stringi_1.1.2
[22] evaluate_0.10 yaml_2.1.14 tools_3.3.3
[25] parallel_3.3.3 withr_1.0.2 synchronicity_1.1.9.1
[28] lazyeval_0.2.0 crayon_1.3.2 backports_1.0.4
[31] memoise_1.0.0 bigmemory_4.5.19 Rcpp_0.12.11
[34] doParallel_1.0.10
This document is licensed under the Creative Commons Attribution 4.0 International Public License.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.