#' Call mutations in a bam-file
#'
#' @description This function evaluate the presence (calls) of individual mutations from a predefined list.
#' @inheritParams call_cancer
#' @param bam_file_path Path to .BAM-file
#' @param reference_path Path to reference genome e.g. FASTA-file.
#' @param ncores Number of processing cores
#' @param batch_size Number of positions to process at a time
#' @param log_file write log-file to this path
#'
#' @return A [data.frame()] with information about the individual mutation calls, including:
#' \describe{
#' \item{chr, genomic_pos}{The genomic position of the mutation.}
#' \item{ref, alt}{The reference and alternative allele.}
#' \item{EM_converged}{If the EM algorithm converged.}
#' \item{EM_steps, fpeval, objfeval}{Number of steps and function evaluations by the EM algorithm.}
#' \item{tf_est}{The estiamted tumor fraction (allele fraction).}
#' \item{tf_min, tf_max}{The confidence interval of \code{tf_est}.}
#' \item{exp_count}{The expected count of the alternative allele under the error (null) model.}
#' \item{count}{The count of the alternative allele.}
#' \item{coverage}{The coverage used by the model (only referenceredas with and alternative allele).}
#' \item{full_coverage}{The total coverage of the position (for reference).}
#' \item{obs_freq}{The observed frequency of the alternative allele.}
#' \item{ll_0, ll_A}{The value of the log-likelihood function under the null (tf=0) and alternative (tf>0) hypothesis.}
#' \item{Q_val, df, p_val}{The chisq test statistic, degrees of freedom and p-value of the statistical test.}
#' \item{mutation_detected}{Whether the mutation was detected at the supplied alpha level.}
#' }
#'
#' @seealso [call_mutations()], [call_cancer()], [train_dreams_model()]
#'
#' @import parallel
#' @import doParallel
#' @importFrom foreach %dopar%
#' @export
dreams_vc_parallel <- function(mutations_df, bam_file_path, reference_path, model,
alpha = 0.05, use_turboem = TRUE, calculate_confidence_intervals = FALSE,
batch_size = NULL, ncores = 1, log_file = NULL) {
if (nrow(mutations_df) == 0) {
return(data.frame())
}
if (nrow(mutations_df) < ncores) {
ncores <- max(nrow(mutations_df), 1)
}
serial_model <- keras::serialize_model(model)
mutations_df <- mutations_df %>%
dplyr::select(
"chr" = matches("chr|CHR|CHROM"),
"genomic_pos" = matches("pos|POS"),
"ref" = matches("ref|REF"),
"alt" = matches("alt|ALT|obs|OBS")
) %>%
mutate(idx = sort(row_number() %% ncores, decreasing = F))
index_list <- unique(mutations_df$idx)
cl <- makeCluster(ncores)
doParallel::registerDoParallel(cl)
mutation_calls <- foreach::foreach(
i = index_list,
.combine = rbind,
.packages = c("keras", "tensorflow", "parallel", "doParallel"),
.errorhandling = "pass",
.export = "dreams_vc"
) %dopar% {
unserial_model <- keras::unserialize_model(serial_model)
if (!is.null(log_file)) {
sink(paste0(log_file, "_", i))
}
mutations <- mutations_df %>%
dplyr::filter(idx == i) %>%
dplyr::select(-idx)
current_calls <- dreams_vc(
mutations_df = mutations,
bam_file_path = bam_file_path,
reference_path = reference_path,
model = unserial_model,
use_turboem = use_turboem,
batch_size = batch_size,
calculate_confidence_intervals = calculate_confidence_intervals,
alpha = alpha
)
current_calls
return(current_calls)
}
sink()
stopCluster(cl)
return(mutation_calls)
}
#' Call mutations in a bam-file
#'
#' @description This function evaluate the presence (calls) of individual mutations from a predefined list.
#' @inheritParams call_cancer
#' @param bam_file_path Path to .BAM-file
#' @param reference_path Path to reference genome e.g. FASTA-file.
#' @param batch_size Number of positions to process at a time
#' @return A [data.frame()] with information about the individual mutation calls, including:
#' \describe{
#' \item{chr, genomic_pos}{The genomic position of the mutation.}
#' \item{ref, alt}{The reference and alternative allele.}
#' \item{EM_converged}{If the EM algorithm converged.}
#' \item{EM_steps, fpeval, objfeval}{Number of steps and function evaluations by the EM algorithm.}
#' \item{tf_est}{The estiamted tumor fraction (allele fraction).}
#' \item{tf_min, tf_max}{The confidence interval of \code{tf_est}.}
#' \item{exp_count}{The expected count of the alternative allele under the error (null) model.}
#' \item{count}{The count of the alternative allele.}
#' \item{coverage}{The coverage used by the model (only referenceredas with and alternative allele).}
#' \item{full_coverage}{The total coverage of the position (for reference).}
#' \item{obs_freq}{The observed frequency of the alternative allele.}
#' \item{ll_0, ll_A}{The value of the log-likelihood function under the null (tf=0) and alternative (tf>0) hypothesis.}
#' \item{Q_val, df, p_val}{The chisq test statistic, degrees of freedom and p-value of the statistical test.}
#' \item{mutation_detected}{Whether the mutation was detected at the supplied alpha level.}
#' }
#'
#' @seealso [call_mutations()], [call_cancer()], [train_dreams_model()]
#'
#' @export
dreams_vc <- function(mutations_df, bam_file_path, reference_path, model,
alpha = 0.05, use_turboem = TRUE, calculate_confidence_intervals = FALSE,
batch_size = NULL) {
# Clean up mutations
mutations_df <- mutations_df %>%
select(
"chr" = matches("chr|CHR|CHROM"),
"genomic_pos" = matches("pos|POS"),
"ref" = matches("ref|REF"),
"alt" = matches("alt|ALT|obs|OBS")
)
# Stop if mutations do not have the expected columns
mutations_expected_columns <- c("chr", "genomic_pos", "ref", "alt")
if (!all(mutations_expected_columns %in% colnames(mutations_df))) {
stop("mutations_df should have the columns ['chr', genomic_pos', 'ref, 'alt']")
}
positions <- mutations_df %>%
select(.data$chr, .data$genomic_pos) %>%
distinct()
if (is.null(batch_size)) {
batch_size <- nrow(positions) + 1
}
position_batches <- positions %>% mutate(batch_idx = (row_number() %/% batch_size))
mutation_calls <- NULL
n_batches <- length(unique(position_batches$batch_idx))
print(paste0("Calling mutations in ", n_batches, " batches:"))
count <- 1
beta <- get_training_data_from_bam(bam_file_path, reference_path)$info$beta
for (batch in sort(unique(position_batches$batch_idx))) {
print(paste0("Calling batch ", count, "/", n_batches))
count <- count + 1
q <- position_batches %>% filter(batch_idx == batch)
# Get read positions
read_positions_df <- get_read_positions_from_BAM(
bam_file_path = bam_file_path,
chr = q$chr,
genomic_pos = q$genomic_pos,
reference_path
)
current_mutations <- mutations_df %>% filter(
.data$chr %in% q$chr,
.data$genomic_pos %in% q$genomic_pos
)
# Call mutations
calls <- call_mutations(
mutations_df = current_mutations,
read_positions_df = read_positions_df,
model = model,
beta = beta,
alpha = alpha,
use_turboem = use_turboem,
calculate_confidence_intervals = calculate_confidence_intervals
)
mutation_calls <- rbind(mutation_calls, calls)
}
return(mutation_calls)
}
#' Call mutations from read positions
#'
#' @description This function evaluate the presence (calls) of individual mutations from a predefined list.
#' @inheritParams call_cancer
#' @param batch_size Number of positions to process at a time
#'
#' @return A [data.frame()] with information about the individual mutation calls, including:
#' \describe{
#' \item{chr, genomic_pos}{The genomic position of the mutation.}
#' \item{ref, alt}{The reference and alternative allele.}
#' \item{EM_converged}{If the EM algorithm converged.}
#' \item{EM_steps, fpeval, objfeval}{Number of steps and function evaluations by the EM algorithm.}
#' \item{tf_est}{The estiamted tumor fraction (allele fraction).}
#' \item{tf_min, tf_max}{The confidence interval of \code{tf_est}.}
#' \item{exp_count}{The expected count of the alternative allele under the error (null) model.}
#' \item{count}{The count of the alternative allele.}
#' \item{coverage}{The coverage used by the model (only referenceredas with and alternative allele).}
#' \item{full_coverage}{The total coverage of the position (for reference).}
#' \item{obs_freq}{The observed frequency of the alternative allele.}
#' \item{ll_0, ll_A}{The value of the log-likelihood function under the null (tf=0) and alternative (tf>0) hypothesis.}
#' \item{Q_val, df, p_val}{The chisq test statistic, degrees of freedom and p-value of the statistical test.}
#' \item{mutation_detected}{Whether the mutation was detected at the supplied alpha level.}
#' }
#' @import foreach
#' @seealso [call_cancer()], [train_dreams_model()]
#'
#' @export
call_mutations <- function(mutations_df, read_positions_df, model, beta,
alpha = 0.05, use_turboem = TRUE, calculate_confidence_intervals = FALSE, batch_size = NULL) {
# If no mutations return empty result
if (nrow(mutations_df) == 0) {
return(data.frame())
}
if (nrow(read_positions_df) == 0) {
return(data.frame())
}
# Clean up mutations
mutations_df <- mutations_df %>%
select(
"chr" = matches("chr|CHR|CHROM"),
"genomic_pos" = matches("pos|POS"),
"ref" = matches("ref|REF"),
"alt" = matches("alt|ALT|obs|OBS")
)
# Stop if mutations do not have the expected columns
mutations_expected_columns <- c("chr", "genomic_pos", "ref", "alt")
if (!all(mutations_expected_columns %in% colnames(mutations_df))) {
stop("mutations_df should have the columns ['chr', genomic_pos', 'ref, 'alt']")
}
# Stop if reads do not have the expected columns
reads_expected_columns <- c("chr", "genomic_pos", "ref", "obs")
if (!all(reads_expected_columns %in% colnames(read_positions_df))) {
stop("read_positions_df should have the columns ['chr', genomic_pos', 'ref, 'obs']")
}
positions <- mutations_df %>%
select(.data$chr, .data$genomic_pos) %>%
distinct()
if (is.null(batch_size)) {
batch_size <- nrow(positions) + 1
}
position_batches <- positions %>% mutate(batch_idx = (row_number() %/% batch_size))
n_batches <- length(unique(position_batches$batch_idx))
print(paste0("Calling mutations in ", n_batches, " batches:"))
mutation_calls <- NULL
for (batch in sort(unique(position_batches$batch_idx))) {
q <- position_batches %>% filter(batch_idx == batch)
current_read_positions_df <- read_positions_df %>% filter(
.data$chr %in% q$chr,
.data$genomic_pos %in% q$genomic_pos
)
if (nrow(current_read_positions_df) == 0) {
return(data.frame())
}
current_mutations <- mutations_df %>% filter(
.data$chr %in% q$chr,
.data$genomic_pos %in% q$genomic_pos
)
# Prepare EM input
em_input <- prepare_em_input(mutations_df = current_mutations, read_positions_df = current_read_positions_df, model = model, beta = beta)
obs_is_mut_list <- em_input$obs_is_mut_list
error_ref_to_mut_list <- em_input$error_ref_to_mut_list
error_mut_to_ref_list <- em_input$error_mut_to_ref_list
# Add full coverage to mutations
full_coverage_df <- current_read_positions_df %>%
count(.data$chr, .data$genomic_pos, name = "full_coverage")
current_mutations_df <- current_mutations %>%
left_join(full_coverage_df, by = c("chr", "genomic_pos"))
# Call mutations from reads
res_df <- NULL
for (i in 1:nrow(current_mutations_df)) {
# Mutation information
mutation_line <- current_mutations_df[i, ]
# Read information
obs_is_mut <- obs_is_mut_list[[i]]
error_ref_to_mut <- error_ref_to_mut_list[[i]]
error_mut_to_ref <- error_mut_to_ref_list[[i]]
# Run EM algorithm
em_res <- get_tf_estimate_vc(
obs_is_mut = obs_is_mut,
error_ref_to_mut = error_ref_to_mut,
error_mut_to_ref = error_mut_to_ref,
use_turboem = use_turboem
)
# Confidence intervals
if (calculate_confidence_intervals) {
tf_CI <- get_tf_CI(
obs_is_mut_list = obs_is_mut,
error_mut_to_ref_list = error_mut_to_ref,
error_ref_to_mut_list = error_ref_to_mut,
r_est = 1,
tf_est = em_res$tf_est,
alpha = alpha
)
} else {
tf_CI <- list(tf_min = NA, tf_max = NA)
}
# Test mutation
ll_0 <- log_likelihood(
tf = 0,
r = 1,
obs_is_mut_list = obs_is_mut,
error_ref_to_mut_list = error_ref_to_mut,
error_mut_to_ref_list = error_mut_to_ref
)
ll_A <- log_likelihood(
tf = em_res$tf_est,
r = 1,
obs_is_mut_list = obs_is_mut,
error_ref_to_mut_list = error_ref_to_mut,
error_mut_to_ref_list = error_mut_to_ref
)
Q_val <- -2 * (ll_0 - ll_A)
df <- 1
p_val <- stats::pchisq(Q_val, df, lower.tail = FALSE)
# Add result to output
new_row <- data.frame(
# Mutation annotations
mutation_line,
# EM results
em_res,
# Confidence interval
tf_min = tf_CI$tf_min, tf_max = tf_CI$tf_max,
# Misc.
exp_count = sum(error_ref_to_mut),
count = sum(obs_is_mut),
coverage = length(obs_is_mut),
obs_freq = mean(obs_is_mut),
# Test results
ll_0,
ll_A,
Q_val,
df,
p_val,
mutation_detected = p_val <= alpha
)
res_df <- rbind(res_df, new_row)
}
mutation_calls <- rbind(mutation_calls, res_df)
}
return(mutation_calls)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.