#' Calculate error model
#'
#' Calculate error model for ref/ref and alt/alt alleles globally across
#' all positions that should have no trace of contamination.
#'
#' @param dat data.frame containing counts and metrics per SNP and only hom
#' alleles (0/0 and 1/1). Het alleles should have already been removed.
#' @param save.dir character location to save the results
#' @param default_error_rate if absolutely no errors are observed in the data,
#' use this number as the error rate
#' @param filename_prefix file name prefix
#' @param context_mode whether to run in context mode
#'
#' @return data.frame with error model
#'
#' @importFrom utils write.table
#' @export
calculate_error_model <- function(dat, save.dir = NA, filename_prefix = NA,
default_error_rate = 1e-5,
context_mode = FALSE) {
# Prepare error model data frame
bases <- get_bases(context_mode)
EE <- data.frame(ref = rep(bases, length(bases)),
subs = rep(bases, each = length(bases)),
er = 0, reads = 0, denom = 0)
rownames(EE) <- paste(EE$ref, EE$subs, sep = ">")
EE$ref <- as.character(EE$ref)
# Rebuild EE for context from only compatible base changes
for (i in bases) {
for (j in bases) {
# Skip mismatching contexts in the context mode
if (context_mode && !is_compatible_context(i, j)) {
subs <- paste(i, j, sep = ">")
EE <- EE[-which(row.names(EE) == subs), ]
}
}
}
# Calculate number of reads observed for each substitution
for (i in bases) {
for (j in bases) {
# Skip mismatching contexts in the context mode
if (context_mode && !is_compatible_context(i, j))
next
subs <- paste(i, j, sep = ">")
# Look at non-SNP base substitutions (major=i and minor!=j) and count the
# occurence of each event i>j in the context of the genotype
if (!context_mode) {
EE[subs, ]$reads <-
sum(dat %>%
dplyr::filter(gt == "0/0", major == i,
minor != j) %>%
dplyr::pull(j)) +
sum(dat %>%
dplyr::filter(gt == "1/1", minor == i,
major != j) %>%
dplyr::pull(j))
} else {
# Context reference and snp are already updated based on genotype
# See update_context() function
j2 <- substring(j, 2, 2)
EE[subs, ]$reads <- sum(dat %>%
dplyr::filter(gt != "0/1", context == i,
context_snp != j) %>%
dplyr::pull(j2))
}
}
}
# Calculate denominator and error rate for each substitution
for (i in bases) {
for (j in bases) {
# Skip mismatching contexts in the context mode
if (context_mode && !is_compatible_context(i, j))
next
subs <- paste(i, j, sep = ">")
# Denominator when calculating error rate on SNP position
# is over-counted, therefore it needs to be corrected.
# For example, to calculate C>T error rate, we count the number of
# times the substitution C>T is observed on C>A and C>G SNPs. We cannot
# use C>T SNPs for this purpose due to the possibility of contamination.
# But, when calculating the denominator (how many times the C base is
# observed), we calculate all C>C events that come from all 3 types of
# SNPs described above. Thus we need to reduce this denominator to two
# thirds. One caveat is that, Non-SNP alleles (maf = 0), should not
# receive this correction since they are not supposed to have any
# contamination and can be used to look at all types of error rates.
# In other terms, denom_factor will be 0.66 if all bases in dat are
# from dbSNP, otherwise it will be a larger number closer to 1.
denom_factor <- 2/3 * mean(dat$maf > 0) + mean(dat$maf == 0)
# Denominator in error model is all the times ref base was the original
# base. For example, denom for A>A, A>T, A>G, and A>C are all the same,
# and some of the reads representing these cases
EE[subs, ]$denom <- round(denom_factor * sum(EE[EE$ref == i, ]$reads))
# If no errors are observed, set error rate as NA if there are less than
# 1000 reference observations, otherwise set it as 1 / observations.
# If any number of errors are observed, set it as errors / observations.
EE[subs, ]$er <- ifelse(EE[subs, ]$reads == 0,
ifelse(EE[subs, ]$denom < 1000, NA,
1 / EE[subs, ]$denom),
EE[subs, ]$reads / EE[subs, ]$denom)
}
}
# Remove ref->ref rates, they were only calculated for the denominator
EE <- EE[EE$ref != EE$subs, ]
# If there are missing error rates, set them as global average
# First, handle edge cases if there are no errors
if (sum(is.na(EE$er)) == nrow(EE) | mean(EE$er, na.rm = TRUE) == 0) {
EE$er <- default_error_rate
} else if (sum(is.na(EE$er)) > 0) {
EE[is.na(EE$er), ]$er <- mean(EE$er, na.rm = TRUE)
}
# Write error model to file if save.dir and filename_prefix are specified
if (!is.na(save.dir) && !is.na(filename_prefix)) {
error.file <- file.path(save.dir, paste(filename_prefix, "error.tsv", sep = "."))
write.table(format(EE, digits = 5, trim = TRUE), file = error.file,
sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
}
return(EE)
}
#' Add error rates for each base based on ref/ref and alt/alt error rates
#'
#' @param dat data.table containing counts and metrics per SNP
#' @param EE data.frame errror model
#' @param context_mode
#'
#' @return data.frame with error model
#'
#' @export
add_error_rates <- function(dat, EE, context_mode = FALSE) {
# Substitution type
dat$et = ""
if (context_mode) {
dat <- dat %>%
dplyr::mutate(et = paste(context, ">", context_snp, sep = ""))
} else {
dat <- dat %>%
dplyr::mutate(et = dplyr::case_when(
gt == "0/0" ~ paste(major, ">", minor, sep = ""),
gt == "0/1" ~ paste(major, ">", minor, sep = ""),
gt == "1/1" ~ paste(minor, ">", major, sep = "")))
}
# Error rate corresponding to that substitution type
EE$et <- rownames(EE)
dat <- dat %>%
dplyr::left_join(EE %>% dplyr::select(et, er), by = "et")
return(dat)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.