#' Generate a genotype resampling function.
#'
#' @note
#' The next step is to inject errors in the sampled genotypes:
#' \code{\link{inject_errors}}.
#' @param x A <\code{\link{HLA}}> object.
#' @param eag An <\code{\link{eag_table}}> object.
#' @return A constructor function for <\code{\link{geno_table}}> objects.
#' @family simulation functions
#' @export
#' @examples
#' \dontrun{
#' ## Extract HLA-DPB1 genotype frequencies
#' dpb1 <- HLA("DPB1", "01/01/2014", "23/03/2015")
#'
#' ## Restrict the data to the German sample
#' dpb1.de <- dpb1[provenance == "DE"]
#'
#' ## Generate an EAG table
#' dpb1_eag1412 <- eag_table(gene = "DPB1", nextype_basis_id = "1412")
#'
#' ## Sample a distribution of DNA concentrations
#' concentration <- sample_dna_concentration(dpb1.de, n = 24000, ncores = 8)
#'
#' ## Generate a sampling function
#' sample_dpb1_de <- make_genotype_sampler(dpb1.de, dpb1_eag1412)
#'
#' ## Sample genotypes
#' n <- 10000
#' bin_size <- 3
#' rs <- sample_dpb1_de(concentration, n, bin_size)
#' rs
#' }
make_genotype_sampler <- function(x, eag) {
assertive::assert_is_any_of(x, "HLA")
assertive::assert_is_any_of(eag, "eag_tbl")
assertive::assert_are_identical(gene(x), gene(eag))
foreach <- foreach::foreach
`%do%` <- foreach::`%do%`
icount <- iterators::icount
iter <- iterators::iter
## check if we run in knitr
in_knitr <- isTRUE(getOption('knitr.in.progress'))
## Remove low resolution four-digit codes and cases of unknown alleles
x <- clean_hla_data(x)
xtbl <- x$get_table()
## create lookup tables mapping rep_alleles (before assignment of ambigutiy
## codes), eag alleles (after assignment of ambiguity codes), eag_num, and allele_num
if (!in_knitr) {
message("Creating lookup table and mappers ...\n", sep = "")
}
alleles <- xtbl[, .(allele1, allele2)]
alleles <- hla_sort(unique(c(alleles$allele1, alleles$allele2)))
## a list of lookup tables matching alleles and EAG numbers
## for exons 2 and 3, jokers, and partials
lookup <- lookup_list(alleles, eag)
## a function that maps eag_nums to alleles
map <- make_mapper(lookup)
## a function that maps alleles to eag_nums
remap <- make_remapper(lookup)
## check the genotypes in x for mappability
unique_genotypes <- unique(xtbl$genotype)
if (!in_knitr) {
message("Ensuring mappability of ", length(unique_genotypes),
" genotypes. This step will take a while ...\n", sep = "")
}
pbar <- utils::txtProgressBar(min = 0, max = length(unique_genotypes), style = 3)
on.exit(close(pbar))
## DEBUGGING START ####
# reps <- string2allele(unique(xtbl$genotype))
# i <- 7
# (rep <- reps[[i]])
# (nums <- remap(rep))
# (alls <- map(nums))
# i <- i + 1
# memoise::forget(remap)
# debug(remap)
# undebug(remap)
# memoise::forget(map)
# debug(map)
# undebug(remap)
#
# af <- x$allele_frequency()
# gtf <- x$genotype_frequency()
# a <- "02:01:01"
# b <- "02:02:02"
# af[starts_with(a, allele)]
# af[starts_with(b, allele)]
# gtf[genotype == paste(hla_sort(c(a, b)), collapse = "/")]
#
# sink(file = "remap_dqb1.log")
# i <- 1
# for (rep in string2allele(unique(xtbl$genotype))) {
# nums <- remap(rep)
# alls <- map(nums)
# ident <- rep == alls
# cat("[", i, "] ---------- ident = ", ident, "\n", sep = "")
# print(rep)
# print(nums)
# print(alls)
# i <- i + 1
# }
# sink()
## DEBUGGING END####
iREP <- iter(string2allele(unique_genotypes))
unique_remapped_genotypes <- foreach(rep = iREP, cnt = icount()) %do% {
if (!in_knitr)
utils::setTxtProgressBar(pb = pbar, value = cnt)
tryCatch(map(remap(rep)), error = function(e) {
message("Error '", e$message, "' in ", cnt)
})
}
dt_remap <- data.table(
genotype = unique_genotypes,
genotype2 = allele2string(unique_remapped_genotypes),
eag_status = purrr::map_chr(unique_remapped_genotypes, ~eag_status(remap(.)))
)
setkeyv(dt_remap, "genotype")
## check for and remove genotypes that could not
## be mapped reliably: NA/NA or something like 03:/06:02:02
## dt_remap[!is.genotype(genotype2)]
invalid_gtps <- dt_remap[!is.genotype(genotype2), genotype]
setkeyv(xtbl, "genotype")
if (length(invalid_gtps) > 0) {
xtbl <- xtbl[!invalid_gtps]
dt_remap <- dt_remap[!invalid_gtps]
}
## Calculate genotype frequencies based on
## the remapped genotypes
tbl2 <- xtbl[dt_remap]
tbl2 <- tbl2[, genotype := NULL]
setnames(tbl2, "genotype2", "genotype")
tbl2 <- tbl2[, `:=`(
allele1 = allele1(genotype),
allele2 = allele2(genotype),
zygosity = ifelse(allele1 == allele2, "homozygous", "heterozygous")
)]
setkeyv(tbl2, "lims_donor_id")
x$set_table(tbl2)
gtfrq <- gtf_table(x)
## Globals:
## gtfrq
## Passed on in enclosing environment:
## map
## remap
## cleanup
rm(
`%do%`, cnt, dt_remap, eag, foreach, icount, invalid_gtps,
iREP, iter, lookup, rep, unique_remapped_genotypes, tbl2,
unique_genotypes, x, xtbl
)
structure(function(conc, n, bin_size) {
## Generate a random sample of genotypes based on the observed genotype frequency
gtp <- as.character(gtfrq[, genotype])
frq <- gtfrq[, pobs]
rgeno <- sample(gtp, size = n, replace = TRUE, prob = frq)
## EAG status of the random sample
eag_status <- gtfrq[rgeno][, eag_status]
## Is the genotype homozygous?
rzygosity <- ifelse(is_homozygous(rgeno), "homozygous", "heterozygous")
## Sample the emperically observed DNA concentration
## and distribute them randomly to the generated genotypes.
rconc <- sample(conc[, conc], size = n, replace = TRUE)
## Divide the range of DNA concentrations into slices.
breaks <- slice_bins(conc, bin_size)
bins <- cut(rconc, breaks, include.lowest = TRUE, dig.lab = 4, ordered_result = TRUE)
structure(
list(
samples = dtplyr::tbl_dt(data.table(
genotype = rgeno,
eag_status = eag_status,
zygosity = rzygosity,
conc = rconc,
bins = bins,
idx = seq_len(n),
key = "idx"
)),
errors = dtplyr::tbl_dt(data.table(
genotype = character(),
eag_status = character(),
zygosity = character(),
idx = integer(),
key = "idx"
))
),
## pass along the closure containing the mapper, remapper, and gtf_table
penv = parent.env(environment(NULL)),
breaks = breaks,
class = c("geno_table", "list")
)
}, class = c("genotype_sampler", "function"))
}
#' Class: geno_table
#'
#' Constructor function for a <\code{geno_table}> object. Created
#' by the factory function \code{\link{make_genotype_sampler}}.
#'
#' A function that takes a vector of DNA concentrations
#' to draw \emph{n} random genotypes based on observed genotype
#' frequencies and randomly attributes DNA concentrations to these
#' genotypes.
#'
#' @usage geno_table(conc, n, bin_size)
#' @return
#' A <\code{geno_table}> object with the slots:
#' \itemize{
#' \item "samples": table with the fields:
#' \itemize{
#' \item "genotype": The original genotype in format: \dQuote{03:FYKD/04:ADCGE}.
#' \item "zygosity": One of \sQuote{heterozygous} or \sQuote{homozygous}.
#' \item "conc": Randomly attributed DNA concentrations.
#' \item "bins": Intervals of DNA concentrations used to attribute
#' concentration-dependent errors.
#' \item "idx": \strong{key}; A running index.
#' }
#' \item "errors": table with the fields:
#' \itemize{
#' \item "genotype": The genotype after error injection.
#' \item "zygosity": The zygosity after error injection.
#' \item "idx": \strong{key}; A running index.
#' }
#' }
#' and the attributes:
#' \itemize{
#' \item "penv": The enclosing environment of the <geno_table> constructor
#' containing memoised mapping and remapping functions created by
#' \code{\link{make_mapper}} and a table of observed and expected genotype
#' frequencies generated by \code{\link{gtf_table}}.
#' \item "breaks": The cut points used to slice up the DNA concentration.
#' }
#' @name geno_table
#' @examples
#' \dontrun{
#' data(dpb1)
#' dpb1.pl <- dpb1[provenance == "PL"]
#'
#' ## Generate a distribution of DNA concentrations
#' conc <- sample_dna_concentration(dpb1.pl, n = 1000, ncores = 8)
#'
#' ## Generate a sampling function
#' data(dpb1_eag1412)
#' geno_table <- make_genotype_sampler(dpb1.pl, dpb1_eag1412)
#'
#' ## Access mapping and remapping functions as well as genotype frequencies
#' mapper(geno_table)
#' remapper(geno_table)
#' gtf_table(genotable)
#'
#' ## Sample genotypes
#' n <- 10000
#' bin_size <- 3
#' ans <- geno_table(conc, n, bin_size)
#' ans
#'
#' summary(ans)
#' samples(ans)
#' }
NULL
#' @export
print.geno_table <- function(x, n = 5, ...) {
cat("<geno_table> [samples: ", nrow(x$samples), "; errors: ", nrow(x$errors), "]\n\n", sep = "")
cat(" Samples: ", sep = "")
print(x$samples, n = n)
cat("\n Errors: ", sep = "")
print(x$errors, n = n)
}
#' @export
summary.geno_table <- function(object, ...) {
j <- samples(object)[errors(object)]
j[zygosity == "heterozygous", .(
pHom = sum(i.zygosity == "homozygous", na.rm = TRUE)/nrow(.SD),
pHet = sum(i.zygosity == "heterozygous", na.rm = TRUE)/nrow(.SD),
pHetIdent = sum(i.zygosity == "heterozygous" & genotype == i.genotype, na.rm = TRUE)/nrow(.SD),
pHetDiff = sum(i.zygosity == "heterozygous" & genotype != i.genotype, na.rm = TRUE)/nrow(.SD),
pNA = sum(is.na(i.zygosity))/nrow(.SD),
nerr = nrow(.SD)
)]
}
#' @export
merge.geno_table <- function(x, y = NULL) {
samples(x)[errors(x)] %>%
dplyr::select(genotype, i.genotype, zygosity, i.zygosity, eag_status, i.eag_status, conc, bins) %>%
dplyr::rename(genotype2 = i.genotype, zygosity2 = i.zygosity, eag_status2 = i.eag_status)
}
#' @export
samples.geno_table <- function(x) {
x$samples
}
#' @export
errors.geno_table <- function(x) {
x$errors
}
#' @export
gtf_table.geno_table <- function(x) {
get("gtfrq", envir = attr(x, "penv"))
}
#' @export
gtf_table.genotype_sampler <- function(x) {
get("gtfrq", envir = environment(x))
}
#' @export
mapper.geno_table <- function(x) {
get("map", envir = attr(x, "penv"))
}
#' @export
mapper.genotype_sampler <- function(x) {
get("map", envir = environment(x))
}
#' @export
remapper.geno_table <- function(x) {
get("remap", envir = attr(x, "penv"))
}
#' @export
remapper.genotype_sampler <- function(x) {
get("remap", envir = environment(x))
}
slice_bins <- function(conc, bin_size) {
if (is(conc, "data.table")) {
assertive::assert_is_superset(names(conc), "conc")
conc <- conc[, conc]
}
conc_max <- ceiling(max(conc))
unique(c(seq(0, conc_max, bin_size), conc_max))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.