#' Determine the optimal error tolerance to QC clonal simple CNAs.
#'
#' @description
#'
#' The QC procedure implemented by function \code{analyze_peaks} allows to
#' pass or fail a sample based on a custom purity error (maximum error to allow).
#'
#' A regression has been used to measure the rate of false positives
#' from simulated tumours with variable coverage and purity. This allows to determine
#' an optimal value of $\eqn{\epsilon}$, the parameter `purity_error` of function
#' \code{analyze_peaks}, for a desired rate of alse positives basde on the sample
#' coverage and putative purity.
#'
#' @param purity The data purity (putative) for a sample.
#' @param coverage The observed data coverage.
#' @param fpr Desired false positive rate.
#' @param epsilon_range Range of values to constrain $epsilon$.
#'
#' @return The $\eqn{\epsilon}$ value estimated from data, constrained to be in
#' `epsilon_range`, in order to limit the false positive rate to be at most `fpr`.
#'
#' @export
#'
#' @examples
#' # Desired 10% FPR for a 30% pure tumour at 90x
#' auto_tolerance(.3, 90)
auto_tolerance = function(
purity,
coverage,
fpr = 0.1,
epsilon_range = c(0.01, 0.08)
)
{
fpr_test = CNAqc::fpr_test %>% filter(purity != .60)
fake_ret = list(
epsilon_range = 0.01,
training_plot = ggplot(),
extrapolation_plot = ggplot()
)
# Outside training set
if(fpr_test$coverage %>% max < coverage)
{
cli::cli_alert_danger("Coverage {.field {coverage}} is unsupported - maximum is {.value {fpr_test$coverage %>% max}} - returning 0.01 (default)")
return(fake_ret)
}
if(fpr_test$coverage %>% min > coverage)
{
cli::cli_alert_danger("Coverage {.field {coverage}} is unsupported - minimum is {.value {fpr_test$coverage %>% min}} - returning 0.01 (default)")
return(fake_ret)
}
if(fpr_test$purity %>% max < purity)
{
cli::cli_alert_danger("Purity {.field {purity}} is unsupported - maximum is {.value {fpr_test$purity %>% max}} - returning 0.01 (default)")
return(fake_ret)
}
if(fpr_test$purity %>% min > purity)
{
cli::cli_alert_danger("Purity {.field {purity}} is unsupported - minimum is {.value {fpr_test$purity %>% min}} - returning 0.01 (default)")
return(fake_ret)
}
if(fpr >= 1 | fpr <= 0)
{
cli::cli_alert_danger("FPR outside [0,1] does not make sense - returning 0.01 (default)")
return(fake_ret)
}
# We can work and regress the input
fpr_test = fpr_test %>%
dplyr::mutate(key = paste0(coverage, ":", purity))
# fpr_test %>%
# filter(coverage == 60) %>%
# ggplot(aes(x=epsilon_tolerance, y = FPR, color = purity %>% paste())) +
# geom_point() +
# geom_smooth(method = 'lm')
# Stat smooth data - create linear functions
ggp_data = fpr_test %>%
dplyr::group_split(key) %>%
lapply(function(w)
{
fit <-
glm(FPR ~ epsilon_tolerance,
data = w %>% dplyr::select(epsilon_tolerance, FPR))
# y = mx + q
q = fit$coefficients[1] %>% as.numeric()
m = fit$coefficients[2] %>% as.numeric()
# plot(w$epsilon_tolerance, w$FPR)
# points(w$epsilon_tolerance, m * (w$epsilon_tolerance) + q, col = 'red', pch = 11)
# x = (y - q)/m
inv_fun = function(FPR) {
eval = (FPR - q) / m
if (eval < epsilon_range[1]) {
warning(
"Tolerance below 1% for FPR = ",
FPR,
" - returning ",
epsilon_range[1],
" (epsilon_range)."
)
return(epsilon_range[1])
}
if (eval > epsilon_range[2]) {
warning(
"Tolerance above 100% for FPR = ",
FPR,
" - returning ",
epsilon_range[1],
" (epsilon_range)."
)
return(epsilon_range[2])
}
eval
}
w[1,] %>%
dplyr::select(coverage, purity) %>%
dplyr::mutate(epsilon = inv_fun(fpr))
})
regression_data = Reduce(f = bind_rows, ggp_data)
# names(ggp_data) = fpr_test$key %>% unique
# Evaluate regression test - inverse function
cli::cli_alert("Inverting training from regression; requiring epsilon in range {.field [{epsilon_range[1]} - {epsilon_range[2]}]}.")
# regression_data = tidyr::expand_grid(coverage = fpr_test$coverage %>% unique,
# purity = fpr_test$purity %>% unique) %>%
# mutate(fun = ggp_data[paste0(coverage, ':', purity)])
#
# regression_data$fun_x = sapply(regression_data$fun, function(f)
# f(fpr) %>% round(4))
# Tile plot
reg_plot2 = regression_data %>%
dplyr::mutate(coverage = factor(
coverage,
levels = gtools::mixedsort(regression_data$coverage %>% unique)
)) %>%
mutate(purity = factor(
purity,
levels = gtools::mixedsort(regression_data$purity %>% unique)
)) %>%
ggplot2::ggplot() +
ggplot2::geom_tile(aes(
x = coverage,
y = purity,
fill = epsilon,
width = 0.9,
height = 0.9
)) +
CNAqc:::my_ggplot_theme() +
ggplot2::scale_fill_gradientn(colours = c('indianred3', 'forestgreen', 'steelblue')) +
# scale_fill_viridis_c(option = 'C', limits = c(0, NA)) +
ggplot2::guides(fill = ggplot2::guide_colorbar(bquote("Regressed " * epsilon * " "), barwidth = unit(3, 'cm'))) +
ggplot2::labs(title = bquote("Regressed " * epsilon * ' for FPR < ' * .(fpr)))
# 2D interpolation
cli::cli_alert("Generating interpolated 2D regression map.")
purity_x = seq(min(fpr_test$purity), max(fpr_test$purity), 0.01)
coverage_x = seq(min(fpr_test$coverage), max(fpr_test$coverage), 1)
plot_grid = tidyr::expand_grid(coverage = coverage_x,
purity = purity_x) %>%
rowwise() %>%
mutate(
fit =
akima::interp(
regression_data$purity,
regression_data$coverage,
regression_data$epsilon,
xo = purity,
yo = coverage,
linear = TRUE,
extrap = TRUE
)$z %>% as.numeric
)
# Shape data, apply cutoff
# cli::cli_alert("Generating interpolated regression map.")
plot_grid = plot_grid %>%
filter(!is.na(fit)) %>%
mutate(coverage = factor(coverage,
levels = gtools::mixedsort(plot_grid$coverage %>% unique))) %>%
mutate(purity = factor(purity,
levels = gtools::mixedsort(plot_grid$purity %>% unique))) %>%
mutate(fit = ifelse(fit > epsilon_range[2], epsilon_range[2], fit)) %>%
mutate(fit = ifelse(fit < epsilon_range[1], epsilon_range[1], fit)) %>%
ggplot2::ggplot() +
ggplot2::geom_tile(ggplot2::aes(x = coverage, y = purity, fill = fit)) +
ggplot2::scale_y_discrete(breaks = regression_data$purity %>% unique) +
CNAqc:::my_ggplot_theme() +
# scale_fill_viridis_c(option = 'C', limits = c(0, NA)) +
ggplot2::guides(fill = ggplot2::guide_colorbar(bquote("Extrapolated " * .(epsilon_range[1]) * "<" * epsilon * "<" * .(epsilon_range[2]) *" "), barwidth = unit(3, 'cm'))) +
ggplot2::labs(title = bquote("Extrapolated " * epsilon * ' map')) +
ggplot2::scale_fill_gradientn(colours = c('indianred3', 'forestgreen', 'steelblue')) +
ggplot2::scale_x_discrete(breaks = regression_data$coverage %>% unique)
# Point_estimate
point_estimate = akima::interp(
regression_data$purity,
regression_data$coverage,
regression_data$epsilon,
xo = purity,
yo = coverage,
linear = TRUE,
extrap = TRUE
)$z %>% as.numeric
if(point_estimate < epsilon_range[1]) point_estimate = epsilon_range[1]
if(point_estimate > epsilon_range[2]) point_estimate = epsilon_range[2]
p_est = data.frame(coverage = coverage, purity = purity) %>%
mutate(coverage = factor(coverage,
levels = gtools::mixedsort(coverage_x))) %>%
mutate(purity = factor(purity,
levels = gtools::mixedsort(purity_x)))
plot_grid = plot_grid +
ggplot2::geom_point(
data = p_est,
ggplot2::aes(x = coverage, y = purity),
fill = 'black',
size = 2,
pch = 23
)
cli::cli_alert_success("Suggested point estimate: {.field \u03B5 = {point_estimate}}.")
return(
list(
epsilon_range = point_estimate,
training_plot = reg_plot2,
extrapolation_plot = plot_grid
)
)
}
I
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.