#' Extract training data from BAM files (indel)
#'
#' @param bam_paths Vector of strings. Paths to \code{.bam} files to extract
#' training data from.
#' @param reference_path String. Path to reference genome fasta file.
#' @param bed_include_path String. Path to bed-file with regions to include.
#' Default is \code{NULL}.
#' @param factor Number between 0 and 1. Ratio between negative and positive
#' data. Default is 1.
#' @param common_positions_to_exclude_paths Vector of strings. List of files
#' with positions to exclude from all samples. Default is \code{NULL}.
#' @param positions_to_exclude_paths Vector of strings. List of files with
#' positions to exclude from training with length equal to number of samples.
#' Default is \code{NULL}.
#' @param mm_rate_max Number between 0 and 1. Maximum mismatch rate in position.
#' Default is 1.
#' @param verbose TODO: Write this
#'
#' @return A list containing two elements:
#' \itemize{
#' \item `data`: A `tbl_df`
#' \item `info`: A `data.frame`
#' }
#' @export
#'
#' @seealso [train_dreams_model()] Function for training model.
get_training_data_indels <- function(bam_paths,
reference_path,
bed_include_path = NULL,
factor = 1,
common_positions_to_exclude_paths = NULL,
positions_to_exclude_paths = NULL,
mm_rate_max = 1,
verbose = F) {
# Check if there is a position exclude path for each bam file
if ((!is.null(positions_to_exclude_paths) &
(length(bam_paths) != length(positions_to_exclude_paths)))) {
stop("Wrong number of exclude paths")
}
training_data <- NULL
info <- NULL
n_bam_files = length(bam_paths)
for (bam_idx in 1:n_bam_files) {
bam_path <- bam_paths[[bam_idx]]
if (verbose) {
cat("file ", bam_idx, "/", n_bam_files, "\n")
}
# Combine sample specific position exclusion with common exclusion
if (!is.null(positions_to_exclude_paths)) {
current_positions_to_exclude_paths <- c(common_positions_to_exclude_paths, positions_to_exclude_paths[[bam_idx]])
} else {
current_positions_to_exclude_paths <- common_positions_to_exclude_paths
}
# Get training data for single bam file
current_training_data <- get_training_data_from_bam_indel(
bam_path = bam_path,
reference_path = reference_path,
bed_include_path = bed_include_path,
positions_to_exclude_paths = current_positions_to_exclude_paths,
factor = factor,
mm_rate_max = mm_rate_max
)
training_data <- rbind(training_data, current_training_data$data)
info <- rbind(info, current_training_data$info)
}
# Collect output info for beta calculation
output_info <- data.frame(
total_mismatches = sum(info$n_mismatches),
total_matches = sum(info$n_matches),
total_coverage = sum(info$total_coverage)
) %>% mutate(beta = .data$total_matches / (.data$total_coverage - .data$total_mismatches))
return(list(
data = training_data,
info = output_info
))
}
#' Extract Training Data from BAM File (indels)
#'
#' Extracts training data from a BAM file by integrating information from reference and BED files.
#' It processes genomic positions of indels, extracts features, filters based on mismatch rates,
#' and combines positive and negative samples to form the training dataset.
#'
#' @param bam_path Path to the BAM file.
#' @param reference_path Path to the reference genome file.
#' @param bed_include_path Optional; BED file defining regions to include in the analysis.
#' @param factor The ratio of negative to positive data in the output.
#' @param positions_to_exclude_paths Optional; paths to files defining positions to exclude from training.
#' @param mm_rate_max Maximum mismatch rate allowed in a position.
#' @keywords internal
#'
#' @return A list with two elements: `data`, a `data.frame` containing the combined positive and negative training data,
#' and `info`, a `data.frame` containing metadata about the training set.
#'
get_training_data_from_bam_indel <- function(bam_path, reference_path, bed_include_path = NULL, factor = 1, positions_to_exclude_paths = NULL, mm_rate_max = 1) {
bam_df <- load_BAM(bam_path)
# Add genomic positions of indels
indel_bam_df <- extract_indel_info(bam_df)
# Check if the resulting dataframe has no rows
if (nrow(indel_bam_df) == 0) {
# Print a warning message
warning("No indels were found in the sequence")
# Return an empty dataframe
return(data.frame())
}
# Add features
indels_positions_df <-
extract_features_from_bam_indels(
bam_df = indel_bam_df,
reference_path = reference_path
)
# Filter indels
indels <-
filter_mismatch_positions(
read_positions = indels_positions_df,
bam_file = bam_path,
mm_rate_max = mm_rate_max,
bed_include_path = bed_include_path,
positions_to_exclude_paths = positions_to_exclude_paths
)
positive_samples <- indels$data
info <- indels$info
n_samples <- nrow(positive_samples) * factor
# Generate negative samples
negative_read_positions_df <-
sample_negative_read_positions(
bam_df = bam_df,
n_samples = n_samples
)
# Add features
negative_samples <-
extract_features_from_bam_indels(
bam_df = negative_read_positions_df,
reference_path = reference_path
)
info <- info %>% mutate(n_matches = nrow(negative_samples))
info <- info %>% mutate(beta = nrow(negative_samples) / (info$total_coverage - nrow(positive_samples)))
output_data <- rbind(positive_samples, negative_samples)
output_list <- list(
data = output_data,
info = info
)
return(output_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.