#------------------------------------------------
#' Process data
#'
#' Helper function to process data
#'
#' The function computes whether a SNP is a variant site or not, based on the
#' simulated WSMAF at that SNP. This process additionally accounts for potential
#' sequencing error.
#'
#' @inheritParams process_real
#'
#' @return A list of the following:
#' * `data`: A tibble with
#' + `plmaf_cut`: Breaks of the form `[a, b)`.
#' + `m_variant`: The average WSMAF or proportion of variant sites in each
#' segment defined by `plmaf_cut`.
#' + `bucket_size`: The number of loci in each bucket.
#' + `midpoints`: The midpoint of each bucket.
#' * `seq_error`: The sequence error inferred.
#' * `bin_size`: The minimum size of each bin.
#' * `cuts`: The breaks utilized in splitting the data.
#' of each COI.
#' @keywords internal
process <- function(wsmaf,
plmaf,
coverage,
seq_error = 0.01,
bin_size = 20,
coi_method = "variant") {
# Infer value of seq_error if NULL
if (is.null(seq_error)) {
seq_error <- estimate_seq_error(wsmaf, plmaf, bin_size)
}
if (coi_method == "variant") {
# Isolate PLMAF, determine the PLMAF cuts, and whether a site is a variant,
# accounting for sequence error
df <- data.frame(
plmaf_cut = suppressWarnings(Hmisc::cut2(plmaf, m = bin_size)),
variant = ifelse(wsmaf <= seq_error | wsmaf >= (1 - seq_error), 0, 1),
coverage = coverage
)
} else if (coi_method == "frequency") {
# Subset to heterozygous sites
data <- data.frame(wsmaf = wsmaf, plmaf = plmaf, coverage = coverage) %>%
dplyr::filter(wsmaf > seq_error & wsmaf < (1 - seq_error))
wsmaf <- data$wsmaf
plmaf <- data$plmaf
coverage <- data$coverage
# If remove all data, need to return a pseudo result to not induce errors.
# Additionally, in order to define a cut, need at least 2 data points
if (length(plmaf) <= 1) {
vec <- stats::setNames(
rep("", 4),
c("plmaf_cut", "m_variant", "bucket_size", "midpoints")
)
df_grouped <- dplyr::bind_rows(vec)[0, ]
res <- list(
data = df_grouped,
seq_error = seq_error,
bin_size = bin_size,
cuts = NULL
)
return(res)
}
# Isolate PLMAF, and keep WSMAF as is
df <- data.frame(
plmaf_cut = suppressWarnings(Hmisc::cut2(plmaf, m = bin_size)),
variant = wsmaf,
coverage = coverage
)
}
# Average over intervals of PLMAF
df_grouped <- df %>%
dplyr::group_by(.data$plmaf_cut, .drop = FALSE) %>%
dplyr::summarise(
m_variant = stats::weighted.mean(.data$variant, .data$coverage),
bucket_size = dplyr::n()
) %>%
stats::na.omit()
# Compute midpoints and set coverage to be uniform across each bucket
df_grouped_mid <- find_cut_midpoints(df_grouped, .data$plmaf_cut) %>%
tibble::add_column(coverage = rep(100, nrow(.)))
# Return data, seq_error, and cuts
list(
data = df_grouped_mid,
seq_error = seq_error,
bin_size = bin_size,
cuts = suppressWarnings(Hmisc::cut2(plmaf, m = bin_size, onlycuts = TRUE))
)
}
find_cut_midpoints <- function(data, cuts) {
# Convert single cuts to the standard format: "[lower,upper)"
fix_single_cuts <- dplyr::mutate(
data,
fixed_cuts = as.character({{ cuts }}),
fixed_cuts = ifelse(
!stringr::str_starts(.data$fixed_cuts, "\\["),
glue::glue("[{.data$fixed_cuts},{.data$fixed_cuts})"),
.data$fixed_cuts
)
)
# Find lower and upper bounds
extract_bounds <- tidyr::extract(
data = fix_single_cuts,
col = .data$fixed_cuts,
into = c("lower", "upper"),
regex = "([[:alnum:]].+),([[:alnum:]].+)[\\]\\)]",
convert = TRUE
)
# Determine cut midpoints
dplyr::mutate(
extract_bounds,
midpoints = (.data$upper + .data$lower) / 2,
.keep = "unused"
)
}
#------------------------------------------------
#' Process simulated data
#'
#' Generate the simulated COI curve.
#'
#' Utilize the output of [sim_biallelic()], which creates simulated
#' data. The PLMAF is kept, and the function computes whether a SNP is a
#' variant site or not, based on the simulated WSMAF at that SNP. This process
#' additionally accounts for potential sequencing error. To check whether the
#' simulated WSMAF correctly indicated a variant site or not, the phased
#' haplotype of the parasites is computed.
#'
#' @param sim Output of [sim_biallelic()].
#' @param seq_error The level of sequencing error that is assumed. If no value
#' is inputted, then we infer the level of sequence error.
#' @param bin_size `r lifecycle::badge("deprecated")` This argument is no longer
#' supported; to estimate the COI, all data points are used. Data points are
#' not grouped in bins of changing `plaf`.
#' @inheritParams theoretical_coi
#'
#' @return A list of the following:
#' * `data`: A tibble with
#' + `plmaf_cut`: Breaks of the form `[a, b)`.
#' + `m_variant`: The average WSMAF or proportion of variant sites in each
#' segment defined by `plmaf_cut`.
#' + `bucket_size`: The number of loci in each bucket.
#' + `midpoints`: The midpoint of each bucket.
#' * `seq_error`: The sequence error inferred.
#' * `bin_size`: The minimum size of each bin.
#' * `cuts`: The breaks utilized in splitting the data.
#' of each COI.
#' @family simulated data functions
#' @seealso [process_real()] to process real data.
#' @export
process_sim <- function(sim,
seq_error = 0.01,
bin_size = 20,
coi_method = "variant") {
# Check inputs
if (!is.null(seq_error)) assert_single_bounded(seq_error)
assert_single_pos_int(bin_size)
# Run helper to process
process(
wsmaf = sim$data$wsmaf,
plmaf = sim$data$plmaf,
coverage = sim$data$coverage,
seq_error = seq_error,
bin_size = bin_size,
coi_method = coi_method
)
}
#------------------------------------------------
#' Process real data
#'
#' Generate the COI curve for real data.
#'
#' The function computes whether a SNP is a variant site or not, based on the
#' WSMAF at that SNP. This process additionally accounts for potential
#' sequencing error.
#'
#' @param wsmaf The within-sample minor allele frequency.
#' @param plmaf The population-level minor allele frequency.
#' @param coverage The read coverage at each locus.
#' @inheritParams process_sim
#'
#' @return A list of the following:
#' * `data`: A tibble with
#' + `plmaf_cut`: Breaks of the form `[a, b)`.
#' + `m_variant`: The average WSMAF or proportion of variant sites in each
#' segment defined by `plmaf_cut`.
#' + `bucket_size`: The number of loci in each bucket.
#' + `midpoints`: The midpoint of each bucket.
#' * `seq_error`: The sequence error inferred.
#' * `bin_size`: The minimum size of each bin.
#' * `cuts`: The breaks utilized in splitting the data.
#' of each COI.
#' @family real data functions
#' @seealso [process_sim()] to process simulated data.
#' @export
process_real <- function(wsmaf,
plmaf,
coverage,
seq_error = 0.01,
bin_size = 20,
coi_method = "variant") {
# Check inputs
assert_vector(wsmaf)
assert_bounded(wsmaf)
assert_vector(plmaf)
assert_bounded(plmaf)
if (!is.null(seq_error)) assert_single_bounded(seq_error)
assert_single_pos_int(bin_size)
input <- tibble::tibble(wsmaf = wsmaf, plmaf = plmaf, coverage = coverage) %>%
tidyr::drop_na()
# In some cases we are fed in the major allele so we ensure we only examine
# the minor allele. If the PLAF is > 0.5, we know it is the major allele so
# we look at 1 - WSAF and 1 - PLAF.
minor <- input %>%
dplyr::mutate(
wsmaf = ifelse(plmaf > 0.5, 1 - wsmaf, wsmaf),
plmaf = ifelse(plmaf > 0.5, 1 - plmaf, plmaf)
)
assert_bounded(minor$plmaf, left = 0, right = 0.5)
# Run helper to process
process(
wsmaf = minor$wsmaf,
plmaf = minor$plmaf,
coverage = minor$coverage,
seq_error = seq_error,
bin_size = bin_size,
coi_method = coi_method
)
}
#' @noRd
estimate_seq_error <- function(wsmaf, plmaf, bin_size) {
# Cut the data. We define error break to allow for flexible bucket sizes.
error_break <- 0.02
bins <- cut(plmaf, seq(0, 0.5, error_break), include.lowest = TRUE)
# We want to ensure that we have at least bin_size points in the first bin
while (table(bins)[1] < bin_size) {
error_break <- error_break * 1.25
bins <- cut(plmaf, seq(0, 0.5, error_break), include.lowest = TRUE)
}
# Data points in the lowest bin that are likely sequence error
low_wsmafs <- wsmaf[which(bins == levels(bins)[1])]
error <- low_wsmafs[low_wsmafs > 0 & low_wsmafs < 0.5]
# If wanted to do mixture models, would fit something to error
# Expected number of points
expected <- round(length(low_wsmafs) * (error_break / 2), 4)
# Remove expected number of points from true points
error_dist <- utils::head(sort(error), -expected)
# Find 95% error
seq_error <- as.numeric(stats::quantile(error_dist, 0.95, na.rm = T))
# Ensure that seq_error is greater than 1%
seq_error <- round(max(seq_error, 0.01, na.rm = T), 4)
return(seq_error)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.