In gschofl/HLAsim: Simulate allelic dropouts in HLA data

knitr::opts_chunk$set(warning = FALSE, message = FALSE, collapse = TRUE, comment = "#>")
library(HLAsim)
library(dplyr)
library(ggplot2)

The following code partially reproduces Figure 2 in the manuscript.

Load data

Genotyping results for the class II HLA genes, DRB1, DQB1, and DPB1, processed between 01/01/2014 and 23/04/2015 at DKMS Life Science Lab.

data(list = c("drb1", "dqb1", "dpb1"), package = "HLAsim")
drb1

EAG (Exon Allele Group) codes for IMGT/HLA release 3.17.0. EAG codes group HLA alleles that are identical over specific exons. They are used by the allele matching algorithm implemented in neXtype, the HLA typing software used at DKMS Life Science Lab.

data(list = c("drb1_eag1412", "dqb1_eag1412", "dpb1_eag1412"), package = "HLAsim")
drb1_eag1412

Generate genotype samplers

First we create functions that allow generating random genotype samples based on observed genotype frequencies.

Genotypes have been assigned based on different IMGT/HLA releases over time. Typically this may affect the use of ambiguity codes (NMDP codes and G-groups) as new reference sequences get added to the database. Therefore, a set of exonic sequences is only guarantied to map to set of allele codes for a given IMGT/HLA release. make_genotype_sampler normalises all allele codes with respect to a single IMGT/HLA release by remapping genotypes based on a given set of EAG codes.

We create such sampler functions for each gene and country of origin of samples, respectively.

sample_drb_de <- make_genotype_sampler(drb1[provenance == "DE"], drb1_eag1412)
sample_drb_pl <- make_genotype_sampler(drb1[provenance == "PL"], drb1_eag1412)
sample_drb_uk <- make_genotype_sampler(drb1[provenance == "UK"], drb1_eag1412)

sample_dqb_de <- make_genotype_sampler(dqb1[provenance == "DE"], dqb1_eag1412)
sample_dqb_pl <- make_genotype_sampler(dqb1[provenance == "PL"], dqb1_eag1412)
sample_dqb_uk <- make_genotype_sampler(dqb1[provenance == "UK"], dqb1_eag1412)

sample_dpb_de <- make_genotype_sampler(dpb1[provenance == "DE"], dpb1_eag1412)
sample_dpb_pl <- make_genotype_sampler(dpb1[provenance == "PL"], dpb1_eag1412)
sample_dpb_uk <- make_genotype_sampler(dpb1[provenance == "UK"], dpb1_eag1412)

A sampler is a function that takes a table of DNA concentrations, conc, a sample size, n, and an interval to bin DNA concentrations, bin_size as arguments. It will generate n random genotyps based on observed genotype frequencies and randomly associate these geneotypes with DNA concentrations.

sample_drb_de

Load DNA-concentration/error-rate data

DNA concentration measurements for approx. 24,000 random samples from Germany, Poland, and the UK, respectively.

data("concentration", package = "HLAsim")
concentration

An empirical cumulative distribution function describing DNA-concentration-dependent error rates based on data for HLA-A extracted for the time period between 01/09/2014 and 23/05/2015.

data("cdfA", package = "HLAsim")
plot(cdfA)

Resample genotypes

# Total number of random samples
n <- 1e6
# Binwidth for DNA concentrations
bin_width <- 2.5

## DRB1
gtbl_drb1_de <- sample_drb_de(concentration[provenance == "DE"], n, bin_width)
gtbl_drb1_pl <- sample_drb_pl(concentration[provenance == "PL"], n, bin_width)
gtbl_drb1_uk <- sample_drb_uk(concentration[provenance == "UK"], n, bin_width)

## DQB1
gtbl_dqb1_de <- sample_dqb_de(concentration[provenance == "DE"], n, bin_width)
gtbl_dqb1_pl <- sample_dqb_pl(concentration[provenance == "PL"], n, bin_width)
gtbl_dqb1_uk <- sample_dqb_uk(concentration[provenance == "UK"], n, bin_width)

## DPB1
gtbl_dpb1_de <- sample_dpb_de(concentration[provenance == "DE"], n, bin_width)
gtbl_dpb1_pl <- sample_dpb_pl(concentration[provenance == "PL"], n, bin_width)
gtbl_dpb1_uk <- sample_dpb_uk(concentration[provenance == "UK"], n, bin_width)

gtbl_drb1_de

Inject alleleic dropouts

We inject errors by randomly chosing either exon 2, exon 3, or both for allelic dropout. The odds of a sequence being dropped depend on the experimental setup: At DKMS LSL exons 2 of DRB1 and DPB1 are amplified in two PCR reactions each, whereas exon 3 in amplified in a single PCR reaction. In this case, the odds in favour of dropping a sequence from exon 2 rather than exon 3 are 1:4. The overall proportion of errors (perr) is chosen at random.

Note, that tu fully reproduce Figure 2 one would need to simulated data over a range of error rates and odds.

## DRB1
err_tbl_drb1_de <- inject_errors(gtbl_drb1_de, cdfA, perr = 0.01, odds = 0.25)
err_tbl_drb1_pl <- inject_errors(gtbl_drb1_pl, cdfA, perr = 0.01, odds = 0.25)
err_tbl_drb1_uk <- inject_errors(gtbl_drb1_uk, cdfA, perr = 0.01, odds = 0.25)

## DQB1
err_tbl_dqb1_de <- inject_errors(gtbl_dqb1_de, cdfA, perr = 0.01, odds = 1)
err_tbl_dqb1_pl <- inject_errors(gtbl_dqb1_pl, cdfA, perr = 0.01, odds = 1)
err_tbl_dqb1_uk <- inject_errors(gtbl_dqb1_uk, cdfA, perr = 0.01, odds = 1)

## DPB1
err_tbl_dpb1_de <- inject_errors(gtbl_dpb1_de, cdfA, perr = 0.01, odds = 0.25)
err_tbl_dpb1_pl <- inject_errors(gtbl_dpb1_pl, cdfA, perr = 0.01, odds = 0.25)
err_tbl_dpb1_uk <- inject_errors(gtbl_dpb1_uk, cdfA, perr = 0.01, odds = 0.25)

err_tbl_drb1_de

Effects of gene and sample provenance

Finally we can summarise the effects of gene and sample provenance on error categories

library(foreach)
library(iterators)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggthemes)

summarise_gtbl <- function(x) {
  tbl <- summary(x) %>%
    dplyr::select(-pHet, -nerr) %>%
    tidyr::gather(type, proportion, pHom:pNA) %>%
    dplyr::tbl_df() %>% 
    dplyr::mutate(type = factor(type, levels = c("pHom", "pHetIdent", "pHetDiff", "pNA")))
  levels(tbl$type) <- c("erroneous HM", "unchanged HT", "erroneous HT", "incomp. exons")
  tbl
}

tbls <- iter(ls(pattern = "^err_tbl_.+_.+"))
tbls.summary <- tbl_dt(foreach(tbl = tbls, .combine = "rbind") %do% {
  gene <- toupper(strsplit(tbl, "_")[[1]][3])
  provenance <- toupper(strsplit(tbl, "_")[[1]][4])
  summarise_gtbl(get(tbl)) %>%
    mutate(provenance = provenance, gene = factor(gene, levels = c("DRB1", "DQB1", "DPB1")))
})

ggplot(tbls.summary, aes(x = type, y = proportion, fill = provenance)) +
  geom_bar(position = "dodge", stat = "identity", colour = "white") +
  facet_grid(. ~ gene) +
  geom_text(aes(label = ifelse(proportion > 0.01, round(proportion, 2), ""),
                hjust = ifelse(proportion > 0.2, 1.2, -0.2)),
            angle = 90, colour = "black", position = position_dodge(0.9), size = 2.5) +
  scale_fill_brewer(palette = "Pastel1") +
  labs(x = "Genotyping error category", y = "Proportion", fill = "Provenance") +
  theme(axis.title.x = element_text(vjust = -0.5),
        axis.title.y = element_text(vjust = 1),
        axis.text.x = element_text(size = 8, angle = 30, vjust = 1, hjust = 1)) +
  theme_hc() + 
  theme(legend.position = "top")

Generating training data for prediction of spurious typings

Genotype frequency tables have been generated when we set up the sampler functions

gtf_table(sample_drb_de)

To generate training data for probabilistic learning of genotyping errors based on DNA concentration and allele frequencies we procede by mapping the observed and expected genotype frequencies to the simulated error tables using map_genotype_frequencies.

drb1.01.de <- map_genotype_frequencies(err_tbl_drb1_de) %>%
  dplyr::mutate(provenance = "DE")
drb1.01.de %>% 
  dplyr::group_by(Class) %>% 
  dplyr::sample_n(5)