R/extract_features_from_bam.R

Defines functions get_reference_seq extract_features_from_bam calc_string_entropy_k_mer

Documented in calc_string_entropy_k_mer extract_features_from_bam get_reference_seq

#' Calculate Shannon Entropy for K-mers in a String
#'
#' @description This function computes the Shannon entropy for all possible k-mers within a given string.
#' Shannon entropy is a measure of the unpredictability of the k-mer composition, with higher
#' values indicating more diversity. This function is useful in bioinformatics for analyzing the
#' complexity of sequences.
#'
#' @param s A string for which the Shannon entropy is to be calculated.
#' @param k The size of the k-mers (substrings of length k) to be considered for entropy calculation.
#' The default value is 2.
#' @param alphabet A vector of characters representing the possible nucleotides or readings in the string.
#' The default set includes "A", "C", "G", "T", "N".
#'
#' @return Returns a numeric value representing the Shannon entropy of the k-mers in the string.
#' This entropy value is a measure of the randomness or diversity of k-mer composition.
#' @keywords internal
#'
#' @importFrom dplyr %>%
calc_string_entropy_k_mer <- function(s, k = 2, alphabet = c("A", "C", "G", "T", "N")) {

  # Generate k-mers
  alphabet_k_rep_list <- rep(list(alphabet), k)
  k_mer_df <- expand.grid(alphabet_k_rep_list)
  k_mer_vec <- apply(k_mer_df, 1, paste0, collapse = "")

  s_length <- nchar(s) - (k - 1)

  count_occurrences <- function(patterns, in_strings) {
    counts_list <- lapply(in_strings, function(in_string) {
      sapply(patterns, function(pattern) {
        pattern_length <- nchar(pattern)
        num_matches <- 0
        for (i in 1:(nchar(in_string) - pattern_length + 1)) {
          substring <- substr(in_string, i, i + pattern_length - 1)
          if (substring == pattern) {
            num_matches <- num_matches + 1
          }
        }
        num_matches
      })
    })
    counts_matrix <- do.call(cbind, counts_list)
    colnames(counts_matrix) <- in_strings
    return(counts_matrix)
  }

  count_mat_matrix <- count_occurrences(k_mer_vec, s)

  freq_mat <- t(count_mat_matrix) / s_length



  # Shannon entropy
  H <- -rowSums(freq_mat * log10(freq_mat), na.rm = TRUE)

  return(as.numeric(H))
}


#' Extract Features from BAM Data
#'
#' @param bam_df A DataFrame originating from 'load_BAM' and processed by extract_mismatch_positions' containing alignment data.
#' @param reference_path Path to the reference genome file in FASTA format.
#' @param add_umi_features Check if umi information is available.
#'
#' @return A DataFrame with extracted features, including read positions and possibly UMI data.
#' @keywords internal
#'
#' @importFrom purrr map2_int
#'
#'
extract_features_from_bam <- function(bam_df, reference_path, add_umi_features = all(c("cd", "ce") %in% colnames(bam_df))) {

  # If UMI features are asked for but not present
  if (add_umi_features & !all(c("cd", "ce") %in% colnames(bam_df))) {
    stop("UMI features (ce and cd) are not available in bam_df!")
  }

  if (nrow(bam_df) == 0) {
    return(data.frame())
  }

  # Make genomic position features
  genomic_pos_feature_df <-
    distinct(
      base::data.frame(
        chr = bam_df$chr,
        genomic_pos = bam_df$genomic_pos,
        strand = bam_df$strand
      )
    ) %>%
    mutate(
      context11 =
        get_reference_seq(
        chr = .data$chr,
        genomic_pos = .data$genomic_pos,
        buffer = 5,
        reference_path = reference_path
      ),
      local_complexity_1 = calc_string_entropy_k_mer(.data$context11, k = 1),
      local_complexity_2 = calc_string_entropy_k_mer(.data$context11, k = 2),
      local_GC = str_count(.data$context11, "[GC]") / 11,
      ctx_minus1 = substring(.data$context11, 5, 5),
      ref = substring(.data$context11, 6, 6),
      ctx_plus1 = substring(.data$context11, 7, 7),
      trinucleotide_ctx = paste0(.data$ctx_minus1, .data$ref, .data$ctx_plus1)
    )

  # Make read specific features
  read_feature_df <-
    bam_df %>%
    mutate(
      pos_idx = .data$genomic_pos - .data$pos + 1
    ) %>%
    correct_pos_idx_w_cigar() %>%
    mutate(
      obs = ifelse(
        .data$is_in_deletion,
        "N",
        substring(.data$seq, .data$pos_idx, .data$pos_idx)
      ),
      fragment_size = abs(.data$isize),
      seq_length = nchar(.data$seq),
      read_index = if_else(.data$strand == "fwd", .data$pos_idx, .data$seq_length - .data$pos_idx + 1),
      first_in_pair = as.integer(as.logical(bitwAnd(.data$flag, 64))),
      n_errors_in_read = str_count(.data$MD, "\\d+[ATCG]"),
      n_insertions_in_read = str_count(.data$cigar, "I"),
      n_deletions_in_read = str_count(.data$cigar, "D")
    ) %>%
    # TODO: Move to filter function! Or do before calling this function!
    filter(.data$fragment_size != 0)

  # Features for output
  selected_features <-
    c(
      "qname", "chr", "genomic_pos", "obs", "ref",
      "strand", "first_in_pair", "read_index", "fragment_size",
      "ctx_minus1", "ctx_plus1", "trinucleotide_ctx", "context11",
      "local_complexity_1", "local_complexity_2", "local_GC",
      "n_other_errors", "n_insertions_in_read", "n_deletions_in_read", "seq_length"
    )

  # Add UMI features if asked
  if (add_umi_features) {
    read_feature_df <-
      read_feature_df %>%
      mutate(
        umi_count = map2_int(.data$cd, .data$pos_idx, function(cd, pos_idx) cd[pos_idx]),
        umi_errors = map2_int(.data$ce, .data$pos_idx, function(ce, pos_idx) ce[pos_idx])
      )

    # Add UMI features to selection
    selected_features <- c(selected_features, c("umi_count", "umi_errors"))
  }

  # Join and select features: Read, genomic positions and UMI
  feature_df <-
    left_join(read_feature_df, genomic_pos_feature_df,
      by = c("chr", "genomic_pos", "strand")
    ) %>%
    mutate(
      n_other_errors = .data$n_errors_in_read - ifelse((!.data$is_in_deletion) & (.data$obs != .data$ref), 1, 0)
    ) %>%
    select(all_of(selected_features))
  return(feature_df)
}

#' Get reference sequence
#'
#' @param chr chromosome
#' @param genomic_pos genomic_position
#' @param buffer how many neighbors to include
#' @param reference_path reference genome fa
#'
#' @importFrom Rsamtools FaFile getSeq
#' @keywords internal

get_reference_seq <- function(chr, genomic_pos, buffer, reference_path) {
  FaFile <- FaFile(reference_path)

  Fa <- getSeq(FaFile, param = GRanges(chr, IRanges(genomic_pos - buffer, genomic_pos + buffer)))

  return(as.character(Fa, use.names = FALSE))
}
JakobPedersenLab/dreams documentation built on Feb. 2, 2024, 3:14 p.m.