#' Inject errors into a sample of genotypes.
#'
#' Takes a \code{\link{geno_table}} object and injects a total of
#' \emph{perr} errors with a concentration-dependent rate described
#' by a cumulative distribution functions, \emph{cdf}, using the
#' probability calculation described in \code{\link{loose_eag}}..
#'
#' @note The next step is to map genotype frequencies onto the sampled
#' genotypes: \code{\link{map_genotype_frequencies}}.
#' @param x A \code{\link{geno_table}} object with attribute \code{has_errors = FALSE}.
#' @param cdf A cumulative distribution function describing a DNA-concentration-
#' dependent error rate. See \code{\link{cdfA}} and \code{\link{cdfB}} for two
#' precomputed error functions based on HLA-A and HLA-B.
#' @param perr The overall error rate.
#' @param odds The odds of loosing an EAG on exon 2 or exon 3, respectively.
#' E.g. \code{odds = 2} means a 2:1 chance that the loss occurs on Exon 2;
#' \code{odds = 0.25} means a 1:4 chance that the loss occurs on Exon 3.
#'
#' @return A \code{\link{geno_table}} object with attribute \code{has_errors = TRUE}.
#' @family simulation functions
#' @export
#' @examples
#' \dontrun{
#' ## Extract HLA-DPB1 genotype frequencies
#' dpb1 <- HLA("DPB1", "01/01/2014", "23/03/2015")
#'
#' ## Restrict the data to the German sample
#' dpb1.de <- dpb1[provenance == "DE"]
#'
#' ## Generate an EAG table
#' dpb1_eag1412 <- eag_table(gene = "DPB1", nextype_basis_id = "1412")
#'
#' ## Generate a distribution of DNA concentrations
#' conc <- sample_dna_concentration(dpb1.de, n = 1000, ncores = 8)
#'
#' ## Generate a sampling function
#' sample_dpb1_de <- make_genotype_sampler(dpb1.de, eag_1267)
#'
#' ## Load precomputed DNA-concentration-dependent error distribution for HLA-A
#' data(cdfA)
#'
#' ## Sample genotypes
#' n <- 10000
#' bin_size <- 3
#' perr <- 0.01
#' odds <- 0.25
#' ans <- sample_dpb1_de(conc, n, bin_size) %>%
#' inject_errors(cdfA, perr, odds)
#' ans
#'
#' summary(ans)
#' samples(ans)
#' errors(ans)
#' merge(ans)
#' }
inject_errors <- function(x, cdf, perr, odds = 1) {
assertive::assert_is_any_of(x, "geno_table")
if (nrow(errors(x)) > 0) {
warning("This <geno_table> has already errors injected", immediate. = TRUE)
return(x)
}
`%do%` <- foreach::`%do%`
iter <- iterators::iter
foreach <- foreach::foreach
## Fetch mappers from the enclosing environment of the
## <geno_table> constructor. Make sure they get re-assigned
## after use to take full advantage of the memoisation.
mapargs <- list(
map = get("map", envir = attr(x, "penv")),
remap = get("remap", envir = attr(x, "penv")),
odds = odds
)
on.exit(assign("remap", mapargs$remap, envir = attr(x, "penv")))
on.exit(assign("map", mapargs$map, envir = attr(x, "penv")), add = TRUE)
## Number of samples
n <- nrow(samples(x))
## Total number of mistyped samples
nerr <- ceiling(n*perr)
## Conditional probability of an error at some slice of DNA
## concentrations given an error occurred
cond_perr <- diff(cdf(attr(x, "breaks")))
cond_perr <- setNames(object = cond_perr, nm = levels(samples(x)[, bins]))
cond_perr <- cond_perr[cond_perr > 0]
## Compute the number of errors per slice of DNA concentration
sampling_size <- round(nerr*cond_perr, digits = 0)
## Sample heterozygotes that will be homogenised, merge them with the true
## homozygotes and join the resulting table with the genotype frequencies.
iSS <- iter(sampling_size)
iSN <- iter(names(sampling_size))
samples <- samples(x)
#ss <- nextElem(iSS)
#sn <- nextElem(iSN)
rs <- foreach(ss = iSS, sn = iSN, .combine = "rbind") %do% {
dt <- samples[bins == sn]
if ((nr <- nrow(dt)) < ss) {
fmt <- paste0(
"\n The number of errors (%s) in bin %s is larger",
" than the number of available samples (%s).\n",
"Try decreasing the overall error rate (perr = %s)")
msg <- sprintf(gsub(" +", " ", trimws(fmt)), ss, sn, nrow(dt), perr)
warning(msg, call. = TRUE)
ss <- nrow(dt)
}
dt[sample.int(nr, size = ss, replace = FALSE)]
}
setkeyv(rs, "bins")
rs2 <- data.table(pE = cond_perr, bins = names(cond_perr), key = "bins")
dt <- rs[rs2, allow.cartesian = TRUE]
dt <- dt[!is.na(genotype)]
dt[, c("genotype", "eag_status") := do_inject(dt, mapargs)]
dt[, zygosity := ifelse(is_homozygous(genotype), "homozygous", "heterozygous")]
dt[, genotype := ifelse(genotype == "NA/NA", NA_character_, genotype)]
dt[, zygosity := ifelse(is.na(genotype), NA_character_, zygosity)]
dt[, conc := NULL]
dt[, bins := NULL]
dt[, pE := NULL]
setkeyv(dt, "idx")
x$errors <- dtplyr::tbl_dt(dt)
x
}
EA <- function(E, O) {
0.5*(1 + O - O*sqrt((1 + 1/O)^2 - (4*E)/O))
}
#' Loose an EAG
#'
#' Randomly loose an EAG number.
#'
#' @details
#' We generally deal with the following scenario on any HLA locus currently:
#'
#' \tabular{rcc}{
#' \tab Exon A \tab Exon B \cr
#' EAG 1 \tab a \tab c \cr
#' EAG 2 \tab b \tab d
#' }
#'
#' We know (have a guesstimate of) the total error rate due to loss of EAGs
#' at a given DNA concentration, \eqn{P(E)}, and we know the odds, \eqn{\omega},
#' that we'll loose an EAG at one or the other exon.
#'
#' E.g. if we perform two independent PCR reactions for exon 2 and only one
#' PCR reaction for exon 3, the odds of loosing an EAG are 4 in 1 for exon
#' 3. I. e. in order for an error to manifest on exon 2 we must loose the
#' same EAG in both reactions, the probability of which is \out{¼}
#' of loosing an EAG in a single reaction.
#'
#' What we want to know is:
#' \itemize{
#' \item \out{P(A ∩ ¬ B | E)}
#' The probability that we loose an EAG on exon A but not on exon B,
#' given that we make an error at all.
#' \item \out{P(¬ A ∩ B | E)}
#' The probability that we loose an EAG on exon B but not on exon A,
#' given that we make an error at all.
#' \item \out{P(A ∩ B | E)}
#' The probability that we loose an EAG on exon A and on exon B,
#' given that we make an error at all.
#' }
#'
#' To get there we need to know the error rate for exon A, \out{ε<sub>A</sub>},
#' and the error rate for exon B, \out{ε<sub>B</sub>}, since:
#' \itemize{
#' \item \out{P(A ∩ ¬ B; | E) = ε<sub>A</sub>(1 - ε<sub>B</sub>)
#' ⁄
#' (ε<sub>A</sub> + ε<sub>B</sub> -
#' ε<sub>A</sub>ε<sub>B</sub>)}
#' \item \out{P(¬ A ∩ B; | E) = ε<sub>B</sub>(1 - ε<sub>A</sub>)
#' ⁄
#' (ε<sub>A</sub> + ε<sub>B</sub> -
#' ε<sub>A</sub>ε<sub>B</sub>)}
#' \item \out{P(¬ A ∩ B; | E) = ε<sub>B</sub>ε<sub>A</sub>
#' ⁄
#' (ε<sub>A</sub> + ε<sub>B</sub> -
#' ε<sub>A</sub>ε<sub>B</sub>)}
#' }
#'
#' Considering that \out{ε<sub>A</sub> + ε<sub>B</sub> -
#' ε<sub>A</sub>ε<sub>B</sub> = P(E)} and
#' \out{ε<sub>B</sub> = <sup>1</sup>⁄<sub>ω</sub>
#' ε<sub>A</sub>} we can arrive after some rearrangements at:
#'
#' \itemize{
#' \item \out{ε<sub>A</sub> = ½(1 + ω - ω
#' √<span style="text-decoration: overline">
#' (1 + 1⁄ω)<sup>2</sup> -
#' 4<em>P(E)</em>⁄ω</span>)}
#' \item \out{ε<sub>B</sub> = <sup>1</sup>⁄<sub>ω</sub>
#' ε<sub>A</sub>}
#' }
#'
#' These error rates allow now to randomly loose EAGs with the
#' correct probabilities.
#'
#' @param x A <\code{\link{eag_numbers}}> instance.
#' @param pE The absolute probability that an error occurs.
#' @param odds The odds in favour of loosing an EAG on Exon 2 rather
#' than Exon 3. E.g. \code{odds = 2} means a 2:1 chance that the loss
#' occurs on Exon 2; \code{odds = 0.25} means a 1:4 chance that
#' the loss occurs on Exon 3.
#' @return An <eag_numbers> instance
#' @keywords internal
loose_eag <- function(x, pE, odds = 1) {
ea <- EA(pE, odds)
eb <- (1/odds)*ea
P_a <- ea*(1 - eb)/pE
P_b <- eb*(1 - ea)/pE
P_ab <- (ea*eb)/pE
## Cast the die to determine if we are going to sample both exons
exons <- sample.int(2L, 1L, replace = FALSE, prob = c(1 - P_ab, P_ab))
if (exons == 2L) {
## Sample one EAG from each exon to loose and keep the other one
loose <- sample.int(2L, 2L, replace = TRUE, prob = c(0.5, 0.5))
keep <- ifelse(loose == 1, 2, 1)
loose[2] <- loose[2] + 2
keep[2] <- keep[2] + 2
}
else {
## Sample one of the four EAGs to loose and keep the other one
loose <- sample.int(4L, 1L, replace = FALSE, prob = c(P_a/2, P_a/2, P_b/2, P_b/2))
keep <- switch(loose, 2, 1, 4, 3)
}
x[loose] <- x[keep]
x
}
#' How many eags do we find on exons 2 and 3?
#'
#' 11 <-> hom:hom
#' 12 <-> hom:het
#' 21 <-> het:hom
#' 22 <-> het:het
#'
#' @param nums Eag number
#' @keywords internal
eag_status <- function(nums) {
paste0(nunique(nums[1:2]), nunique(nums[3:4]))
}
#' Inject errors
#'
#' @param dt A data.table with genotypes and error probabilities
#' @param args A list containing a mapper function <map>, a
#' remapper function <remap>, and <odds>.
#' @keywords internal
do_inject <- function(dt, args) {
dots <- list(a = string2allele(dt$genotype), p = dt$pE)
data.table::rbindlist(
.mapply(inject, dots = dots, MoreArgs = args)
)
}
inject <- function(a, p, odds, map, remap) {
nums <- loose_eag(remap(a), pE = p, odds = odds)
list(slash(map(nums)), eag_status(nums))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.