# http://www.well.ox.ac.uk/~gav/bgen_format/
# library(magrittr)
# file <- tempfile(fileext = ".bgen")
# system.file("testdata", "bgen_example.rds", package = "bigsnpr") %>%
# readRDS() %>% writeBin(bgen_file, useBytes = TRUE)
# system.file("testdata", "bgi_example.rds", package = "bigsnpr") %>%
# readRDS() %>% writeBin(paste0(bgen_file, ".bgi"), useBytes = TRUE)
file <- "tmp-data/test.bgen"
file_con <- file(file, open = "rb", raw = TRUE)
# The first four bytes
first4 <- readBin(file_con, what = 1L, size = 4) ## 15,609,036
# The header block
header <- readBin(file_con, what = raw(), n = first4 - 4)
writeBin(header, tmp <- tempfile())
header_con <- file(tmp, open = "rb", raw = TRUE)
L_H <- readBin(header_con, what = 1L, size = 4) ## 20
stopifnot(L_H < first4)
M <- readBin(header_con, what = 1L, size = 4) ## 7,402,791
N <- readBin(header_con, what = 1L, size = 4) ## 487,409
magic_number <- readBin(header_con, what = raw(), n = 4)
stopifnot(identical(magic_number, charToRaw("bgen")))
flags <- readBin(header_con, what = raw(), n = 4)
flags_bits <- rawToBits(flags)
flags_bits[1:2] ## 01 00 -> 1 -> use zlib
flags_bits[3:6] ## 00 01 00 00 -> 2 -> layout 2
flags_bits[32] ## 01 -> 1 -> sample identifiers
tail(header, 4)
# Variant identifying data
L_id <- readBin(file_con, what = 1L, size = 2, signed = FALSE) ## 12
id <- readBin(file_con, what = raw(), n = L_id)
rawToChar(id) ## "1:10177_A_AC"
L_rsid <- readBin(file_con, what = 1L, size = 2, signed = FALSE) ## 11
rsid <- readBin(file_con, what = raw(), n = L_rsid)
rawToChar(rsid) ## "rs367896724"
L_chr <- readBin(file_con, what = 1L, size = 2, signed = FALSE) ## 2
chr <- readBin(file_con, what = raw(), n = L_chr)
rawToChar(chr) ## "01"
pos <- readBin(file_con, what = 1L, size = 4) ## 10177
K <- readBin(file_con, what = 1L, size = 2, signed = FALSE) ## 2
stopifnot(K == 2)
L_a1 <- readBin(file_con, what = 1L, size = 4) ## 1
a1 <- readBin(file_con, what = raw(), n = L_a1)
rawToChar(a1) ## "A"
L_a2 <- readBin(file_con, what = 1L, size = 4) ## 2
a2 <- readBin(file_con, what = raw(), n = L_a2)
rawToChar(a2) ## "AC"
# possibly more alleles... (but here, only 2!)
C <- readBin(file_con, what = 1L, size = 4) ## 930,596
D <- readBin(file_con, what = 1L, size = 4) ## 1,462,237
probas_compressed <- readBin(file_con, what = raw(), n = C - 4) ## 78 9c ..
# Next variant
L_id <- readBin(file_con, what = 1L, size = 2, signed = FALSE) ## 12
id <- readBin(file_con, what = raw(), n = L_id)
rawToChar(id) ## "1:10235_T_TA"
# decompress data
probas <- memDecompress(probas_compressed, type = "gzip")
probas2 <- gzmem::mem_inflate(probas_compressed, format = "zlib", r_guess_size = D + 0)
identical(probas, probas2)
microbenchmark::microbenchmark(
memDecompress(probas_compressed, type = "gzip"),
gzmem::mem_inflate(probas_compressed, format = "zlib", r_guess_size = D + 0)
) # absolutely no gain...
# data block
writeBin(probas, tmp <- tempfile())
probas_con <- file(tmp, open = "rb", raw = TRUE)
N2 <- readBin(probas_con, what = 1L, size = 4, signed = FALSE)
stopifnot(N2 == N)
K2 <- readBin(probas_con, what = 1L, size = 2, signed = FALSE)
stopifnot(K2 == K)
P_min <- readBin(probas_con, what = 1L, size = 1, signed = FALSE) ## 2
P_max <- readBin(probas_con, what = 1L, size = 1, signed = FALSE) ## 2
ploidy <- readBin(probas_con, what = 1L, size = 1, signed = FALSE, n = N)
stopifnot(all(ploidy == 2)) ## diploid + no missing value
phased <- readBin(probas_con, what = 1L, size = 1, signed = FALSE) ## no
B <- readBin(probas_con, what = 1L, size = 1, signed = FALSE) ## 8
stopifnot(B == 8)
probas_read <- readBin(probas_con, what = 1L, size = 1, signed = FALSE, n = 5 * N)
stopifnot(length(probas_read) == 2 * N)
probas_read
summary(dosage <- 2 - (2 * probas_read[c(TRUE, FALSE)] + probas_read[c(FALSE, TRUE)]) / 255)
stopifnot(D == (4 + 2 + 1 + 1 + N + 1 + 1 + 2 * N))
stopifnot(D == (10 + 3 * N))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.