options(htmltools.dir.version = FALSE, width = 65) knitr::opts_knit$set(global.par = TRUE) knitr::opts_chunk$set(echo = TRUE, fig.align='center', dev='png', dpi = 95)
In this vignette, I show how to get dosage data from a BGEN file. I provide two different solutions, as a proof of concept. Even if the second solution might be used for very large datasets, I plan to better develop the reading of BGEN files when I get the access to the full UK Biobank dataset. Indeed, it would be better to directly read probabilies from the BGEN files, without intermediaries.
This is the format used in package bigsnpr. As the name says, FBM are filebackeg big matrices which store elements on 8 bits and use a code of 256 values in order to map each possible byte with a corresponding access code. For example, I defined the following code:
bigsnpr:::CODE_DOSAGE
The use the first 4 elements to code for a genotype call (0
, 1
, 2
or missing), the 3 following ones to store an imputed genotype call (you may wonder what is the purpose of this but I can assure you that this can be cleverly used in some algorithms).
Then the 201 following values codes for dosages, rounded to two decimal places, which is similar to the 8-bit encoding of the BGEN format used for the UK Biobank. I will also use the 209-th value to code a missing dosage.
The 210-th to 256-th values are currently not used.
A similar data structure called FBM.code65636 (2 bytes) may be implemented to store dosages with a precision of at least 0.00005
. Yet, we think such a precision is not useful and a precision of 0.01
is sufficient for dosage data.
Gavin Band developed an R package to load data in BGEN format. Let us use this R package in the first solution to read dosages from BGEN files in my format bigSNP. To install this R package, follow the instructions there. In summary:
./waf-1.8.13 configure
and then ./waf-1.8.13
in a shell./build/test/test_bgen
and ./build/example/bgen_to_vcf example/example.8bits.bgen
sudo ./waf-1.8.13 install
R CMD INSTALL build/R/rbgen
bgen2FBM <- function(bgen_list) { DIM <- dim(bgen_list$data) # constructing a temporary genotype big.matrix bigGeno <- FBM.code256(DIM[2], DIM[1], code = bigsnpr:::CODE_DOSAGE) # compute dosages from probabilities dim(bgen_list$data) <- c(prod(DIM[1:2]), DIM[3]) dosages <- bgen_list$data %*% 0:2 # transform them in raw codes nonas <- !is.na(dosages) dosages_raw <- matrix(as.raw(3), nrow = DIM[1], ncol = DIM[2]) dosages_raw[nonas] <- as.raw(round(100 * dosages[nonas]) + 7) # put these dosages in the FBM bigGeno[] <- t(dosages_raw) list( genotypes = bigGeno, variants = bgen_list$variants, samples = bgen_list$samples ) }
Use the rbgen package to read BGEN data into R:
ranges = data.frame( chromosome = "01", start = 1, end = 2e9 ) bgen_file <- path.expand("~/Téléchargements/gavinband-bgen-456f4fcbc75c/example/example.8bits.bgen") if (!file.exists(paste0(bgen_file, ".bgi"))) system(glue::glue("bgenix -g {bgen_file} -index")) bgen_list <- rbgen::bgen.load(bgen_file, ranges)
Let's see the dosages we get for some given probabilities:
library(bigstatsr) test <- bgen2FBM(bgen_list) t(bgen_list$data[1:12, 1, ]) test$genotypes[1, 1:12]
BGEN files store twice information as FBM files (probabilities vs dosages) but are compressed so that FBMs are just slightly smaller files.
file.size(test$genotypes$backingfile) file.size(bgen_file)
The PGEN format is binary format of PLINK 2.0 and can store dosages. PLINK 2.0 can also convert from BGEN to PGEN files. Yet, it's not straightforward to read PGEN files either. Yet, PLINK has another function to write these dosages to a text file, which is easier to read from.
First, let us get variant and individual information:
plink2 <- path.expand("~/Téléchargements/gavinband-bgen-456f4fcbc75c/plink2") tmp <- tempfile() # Write bim and fam files from the BGEN file system(glue::glue("{plink2} --bgen {bgen_file}", " --make-just-bim --make-just-fam", " --sort-vars", " --out {tmp}")) # Get the variants and individual information fam <- data.table::fread(paste0(tmp, ".fam"), data.table = FALSE) names(fam) <- bigsnpr:::NAMES.FAM n <- nrow(fam) bim <- data.table::fread(paste0(tmp, ".bim"), data.table = FALSE) names(bim) <- bigsnpr:::NAMES.MAP m <- nrow(bim)
Then, we write the text file of dosages:
system(glue::glue("{plink2} --bgen {bgen_file}", " --export A", " --out {tmp}"))
Now, let us read these dosages into our FBM.code256 format:
/**** Simple Rcpp function to convert dosages in a data.frame to our format ****/ // [[Rcpp::depends(BH, bigstatsr)]] #include <bigstatsr/BMCodeAcc.h> // [[Rcpp::export]] void fill_FBM(Environment BM, const IntegerVector& colInd, const DataFrame& df) { Rcout << colInd << std::endl; XPtr<FBM> xpBM = BM["address"]; SubBMAcc<unsigned char> macc(xpBM, seq_len(xpBM->nrow()) - 1, colInd - 1); double dosage; for (size_t j = 0; j < macc.ncol(); j++) { NumericVector variant = df[j]; for (size_t i = 0; i < macc.nrow(); i++) { dosage = variant[i]; macc(i, j) = R_IsNA(dosage) ? 208 : round(100 * dosage + 7); } } }
# Get variants IDs to read colnames <- names(data.table::fread(paste0(tmp, ".raw"), nrows = 0)) all.equal(bim$marker.ID, as.character(test$variants$rsid)) (cols <- match(paste0(bim$marker.ID, "_G"), colnames)) # Construct a temporary genotype big.matrix bigGeno2 <- FBM.code256(n, m, code = bigsnpr:::CODE_DOSAGE) # Fill it by blocks big_apply(bigGeno2, a.FUN = function(G, ind) { df <- data.table::fread(paste0(tmp, ".raw"), select = colnames[cols[ind]]) fill_FBM(G, colInd = ind, df = df) # returns NULL }, a.combine = 'c', block.size = 20) # Construct the bigSNP object test2 <- structure(list(genotypes = bigGeno2, fam = fam, map = bim), class = "bigSNP") str(test2, max.level = 2) # Verification test2$genotypes[1, 1:12] all.equal(test2$genotypes[], test$genotypes[])
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.