dreams: Deep Read-level Error Model for Sequencing data

Documented in call_cancer dreams_cc

#' Call cancer from read positions in bam-file
#'
#' @description This function evaluates the presence of cancer in a sample by combining the cancerous signal across a catalogue of candidate mutations.
#'
#' @inheritParams call_cancer
#' @param bam_file_path Path to .BAM-file
#' @param reference_path Path to reference genome e.g. FASTA-file.
#'
#' @return
#' A \code{list()} with:
#' \itemize{
#'   \item \strong{cancer_info} A [data.frame()] with results for cancer calling across all mutations:
#'     \describe{
#'       \item{tf_est}{The estiamted tumor fraction (allele fraction).}
#'       \item{tf_min, tf_max}{The confidence interval of \code{tf_est}.}
#'       \item{r_est, est_mutations_present}{The estiamted fraction/number of candidate mutations present in the sample.}
#'       \item{r_min, r_max}{The confidence interval of \code{r_est}.}
#'       \item{mutations_tested}{Number of candidate mutations tested.}
#'       \item{total_coverage, total_count}{Total count and coverage across all mutations (only reference and alternative allele(s).}
#'       \item{mutations_tested}{Number of candidate mutations tested.}
#'       \item{EM_converged}{If the EM algorithm converged.}
#'       \item{EM_steps, fpeval, objfeval}{Number of steps and function evaluations by the EM algorithm.}
#'       \item{ll_0, ll_A}{The value of the log-likelihood function under the null (tf=0) and alternative (tf>0) hypothesis.}
#'       \item{Q_val, df, p_val}{The chisq test statistic, degrees of freedom and p-value of the statistical test.}
#'       \item{cancer_detected}{Whether cancer was detected at the supplied alpha level.}
#'     }
#'
#'   \item \strong{mutation_info} A [data.frame()] with information about the individual mutations:
#'       \describe{
#'         \item{chr, genomic_pos}{The genomic position of the mutation.}
#'         \item{ref, alt}{The reference and alternative allele.}
#'         \item{P_mut_is_present}{The estimated probability the mutation is present in the sample.}
#'         \item{exp_count}{The expected count of the alternative allele under the error (null) model.}
#'         \item{count}{The count of the alternative allele.}
#'         \item{coverage}{The coverage used by the model (only referenceredas with and alternative allele).}
#'         \item{obs_freq}{The observed frequency of the alternative allele.}
#'      }
#'
#' }
#' @seealso [call_mutations()], [train_dreams_model()]
#'
#' @export
dreams_cc <- function(mutations_df, bam_file_path, reference_path, model,
                      alpha = 0.05, calculate_confidence_intervals = FALSE, use_turboem = TRUE) {
  # If no mutations return empty result
  if (nrow(mutations_df) == 0) {
    return(
      list(
        cancer_info = data.frame(),
        mutation_info = data.frame()
      )
    )
  }

  # Clean up mutations
  mutations_df <- mutations_df %>%
    select(
      "chr" = matches("chr|CHR|CHROM"),
      "genomic_pos" = matches("pos|POS"),
      "ref" = matches("ref|REF"),
      "alt" = matches("alt|ALT|obs|OBS")
    )


  # Get read positions
  read_positions_df <- get_read_positions_from_BAM(
    bam_file_path = bam_file_path,
    chr = mutations_df$chr,
    genomic_pos = mutations_df$genomic_pos,
    reference_path
  )

  beta <- get_training_data_from_bam(bam_file_path, reference_path)$info$beta

  # Call cancer
  cancer_call <- call_cancer(
      mutations_df = mutations_df,
      read_positions_df = read_positions_df,
      model = model,
      beta = beta,
      alpha = alpha,
      use_turboem = use_turboem,
      calculate_confidence_intervals = calculate_confidence_intervals
    )

  return(cancer_call)


}


#' Call cancer from read positions
#'
#' @description This function evaluates the presence of cancer in a sample by combining the cancerous signal across a catalogue of candidate mutations.
#'
#' @param mutations_df A [data.frame()] with candidate mutations (SNVs) (chromosome, positions, reference and alternative)
#' @param read_positions_df A [data.frame()] with read-positions. See [get_read_positions_from_BAM()]
#' @param model A dreams model. See [train_dreams_model()].
#' @param beta Down sampling parameter from [get_training_data()] for correcting the error-rates from the DREAMS model.
#' @param alpha Alpha-level used for testing and confidence intervals. Default is 0.05.
#' @param use_turboem Logical. Should [turboEM::turboem()] be used for EM algorithm? Default is TRUE.
#' @param calculate_confidence_intervals Logical. Should confidence intervals be calculated? Default is FALSE.
#'
#' @return
#' A \code{list()} with:
#' \itemize{
#'   \item \strong{cancer_info} A [data.frame()] with results for cancer calling across all mutations:
#'     \describe{
#'       \item{tf_est}{The estiamted tumor fraction (allele fraction).}
#'       \item{tf_min, tf_max}{The confidence interval of \code{tf_est}.}
#'       \item{r_est, est_mutations_present}{The estiamted fraction/number of candidate mutations present in the sample.}
#'       \item{r_min, r_max}{The confidence interval of \code{r_est}.}
#'       \item{mutations_tested}{Number of candidate mutations tested.}
#'       \item{total_coverage, total_count}{Total count and coverage across all mutations (only reference and alternative allele(s).}
#'       \item{mutations_tested}{Number of candidate mutations tested.}
#'       \item{EM_converged}{If the EM algorithm converged.}
#'       \item{EM_steps, fpeval, objfeval}{Number of steps and function evaluations by the EM algorithm.}
#'       \item{ll_0, ll_A}{The value of the log-likelihood function under the null (tf=0) and alternative (tf>0) hypothesis.}
#'       \item{Q_val, df, p_val}{The chisq test statistic, degrees of freedom and p-value of the statistical test.}
#'       \item{cancer_detected}{Whether cancer was detected at the supplied alpha level.}
#'     }
#'
#'   \item \strong{mutation_info} A [data.frame()] with information about the individual mutations:
#'       \describe{
#'         \item{chr, genomic_pos}{The genomic position of the mutation.}
#'         \item{ref, alt}{The reference and alternative allele.}
#'         \item{P_mut_is_present}{The estimated probability the mutation is present in the sample.}
#'         \item{exp_count}{The expected count of the alternative allele under the error (null) model.}
#'         \item{count}{The count of the alternative allele.}
#'         \item{coverage}{The coverage used by the model (only referenceredas with and alternative allele).}
#'         \item{obs_freq}{The observed frequency of the alternative allele.}
#'      }
#'
#' }
#' @seealso [call_mutations()], [train_dreams_model()]
#'
#' @export
call_cancer <- function(mutations_df, read_positions_df, model, beta, alpha = 0.05, calculate_confidence_intervals = FALSE, use_turboem = TRUE) {
  # If no mutations return empty result
  if (nrow(mutations_df) == 0) {
    return(
      list(
        cancer_info = data.frame(),
        mutation_info = data.frame()
      )
    )
  }

  # Clean up mutations
  mutations_df <- mutations_df %>%
    select(
      "chr" = matches("chr|CHR|CHROM"),
      "genomic_pos" = matches("pos|POS"),
      "ref" = matches("ref|REF"),
      "alt" = matches("alt|ALT|obs|OBS")
    )

  # Stop if mutations do not have the expected columns
  mutations_expected_columns <- c("chr", "genomic_pos", "ref", "alt")
  if (!all(mutations_expected_columns %in% colnames(mutations_df))) {
    stop("mutations_df should have the columns ['chr', genomic_pos', 'ref, 'alt']")
  }

  # Stop if reads do not have the expected columns
  reads_expected_columns <- c("chr", "genomic_pos", "ref", "obs")
  if (!all(reads_expected_columns %in% colnames(read_positions_df))) {
    stop("read_positions_df should have the columns ['chr', genomic_pos', 'ref, 'obs']")
  }

  # Prepare inputs for algorithm
  em_input <- prepare_em_input(mutations_df = mutations_df, read_positions_df = read_positions_df, model = model, beta = beta)
  obs_is_mut_list <- em_input$obs_is_mut_list
  error_ref_to_mut_list <- em_input$error_ref_to_mut_list
  error_mut_to_ref_list <- em_input$error_mut_to_ref_list


  # EM algorithm
  em_res <- get_em_parameter_estimates(
    obs_is_mut_list = obs_is_mut_list,
    error_ref_to_mut_list = error_ref_to_mut_list,
    error_mut_to_ref_list = error_mut_to_ref_list,
    use_turboem = use_turboem
  )

  # Confidence intervals
  if (calculate_confidence_intervals) {
    tf_CI <- get_tf_CI(
      obs_is_mut_list = obs_is_mut_list,
      error_mut_to_ref_list = error_mut_to_ref_list,
      error_ref_to_mut_list = error_ref_to_mut_list,
      r_est = em_res$r_est,
      tf_est = em_res$tf_est,
      alpha = alpha
    )
    r_CI <- get_r_CI(
      obs_is_mut_list = obs_is_mut_list,
      error_mut_to_ref_list = error_mut_to_ref_list,
      error_ref_to_mut_list = error_ref_to_mut_list,
      r_est = em_res$r_est,
      tf_est = em_res$tf_est,
      alpha = alpha
    )
  } else {
    tf_CI <- list(tf_min = NA, tf_max = NA)
    r_CI <- list(r_min = NA, r_max = NA)
  }

  # Test significance
  ll_0 <- log_likelihood(
    obs_is_mut_list = obs_is_mut_list,
    error_mut_to_ref_list = error_mut_to_ref_list,
    error_ref_to_mut_list = error_ref_to_mut_list,
    r = 0,
    tf = 0
  )

  ll_A <- log_likelihood(
    obs_is_mut_list = obs_is_mut_list,
    error_mut_to_ref_list = error_mut_to_ref_list,
    error_ref_to_mut_list = error_ref_to_mut_list,
    r = em_res$r_est,
    tf = em_res$tf_est
  )

  Q_val <- -2 * (ll_0 - ll_A)
  df <- 2
  p_val <- stats::pchisq(Q_val, df, lower.tail = FALSE)

  # Collect cancer information
  cancer_info <-
    data.frame(
      tf_est = em_res$tf_est,
      tf_min = tf_CI$tf_min, tf_max = tf_CI$tf_max,
      r_est = em_res$r_est,
      r_min = r_CI$r_min, r_max = r_CI$r_max,
      Q_val = Q_val,
      ll_A = ll_A,
      ll_0 = ll_0,
      mutations_tested = length(obs_is_mut_list),
      est_mutations_present = sum(em_res$P_mut_is_present),
      total_coverage = sum(sapply(obs_is_mut_list, length)),
      total_count = sum(sapply(obs_is_mut_list, sum)),
      EM_converged = em_res$EM_converged,
      EM_steps = em_res$EM_steps,
      fpeval = em_res$fpeval,
      objfeval = em_res$objfeval,
      p_val = p_val,
      cancer_detected = p_val <= alpha
    )

  # Collect mutation information
  mutation_info <-
    dplyr::bind_cols(
      mutations_df,
      data.frame(
        P_mut_is_present = em_res$P_mut_is_present,
        exp_count = sapply(error_ref_to_mut_list, sum),
        count = sapply(obs_is_mut_list, sum),
        coverage = sapply(obs_is_mut_list, length),
        obs_freq = sapply(obs_is_mut_list, mean)
      )
    )

  return(
    list(
      cancer_info = cancer_info,
      mutation_info = mutation_info
    )
  )
}