R/castle_integrated.R

Defines functions test_tumor_sample_integrated train_integrated_ddpcr_model

Documented in test_tumor_sample_integrated train_integrated_ddpcr_model

#' @title Train integrated CASTLE model
#'
#' @description Estimates the parameters of the statistical model for the
#'   integrated version the CASTLE algorithm based on a set of training samples,
#'   assumed to have no mutant DNA present.
#'
#' @details In the test an integrated version of the Likelihood Ratio test
#'   statistic is used, and this is approximated using a Riemann sum on a
#'   3D-grid. The parameter \code{abc_grid_resolution} controls this
#'   approximation as the sum is calculated on \code{abc_grid_resolution^3}
#'   points. Higher values give better approximation but longer computation
#'   times.
#'
#' @param background_samples \code{data.frame} with cancer negative training
#'   samples as rows. These are the samples used to train/estimate the
#'   parameters of the model. At least the following columns should be present:
#'   \itemize{
#'     \item{\code{WildtypeOnlyDroplets}}
#'     \item{\code{MutantOnlyDroplets}}
#'     \item{\code{DoubleNegativeDroplets}}
#'     \item{\code{DoublePositiveDroplets}} } If data from QuataSoft is available,
#'   these can be imported using \code{\link{import_QS_files}}.
#' @param abc_grid_resolution The resolution of the 3D-grid on which the
#'   integrated LR-test statistic is approximated. Default is 25 (equal to a
#'   grid of 25^3 = 15,625 points).
#'
#' @return List of parameter estimates for the integrated CASLTE model. This
#'   should be used as input for \code{\link{test_tumor_sample_integrated}}.
#'
#' @seealso \code{\link{import_QS_files}, \link{test_tumor_sample_integrated}}
#'
#' @export
train_integrated_ddpcr_model <- function(background_samples,
                                         abc_grid_resolution = 25) {
  # Train parameters (get MLE on training data)
  # NOTE: This checks validity of the input!
  simple_model <- train_simple_ddpcr_model(
    background_samples = background_samples
  )

  # Unpack data
  N_WT_only_vec <- background_samples$WildtypeOnlyDroplets
  N_M_only_vec <- background_samples$MutantOnlyDroplets
  N_d_neg_vec <- background_samples$DoubleNegativeDroplets
  N_d_pos_vec <- background_samples$DoublePositiveDroplets

  # Get parameters from model
  l_est_vec <- simple_model$l_est_vec
  a_est <- simple_model$a_est
  b_est <- simple_model$b_est
  c_est <- simple_model$c_est

  # Get bounds of a, b and c to make a grid where training lik function has mass
  bounds <- find_abd_confidence_intervals(
    N_WT_only_vec = N_WT_only_vec,
    N_M_only_vec = N_M_only_vec,
    N_d_neg_vec = N_d_neg_vec,
    N_d_pos_vec = N_d_pos_vec,
    l_est_vec = l_est_vec,
    a_est = a_est,
    b_est = b_est,
    c_est = c_est,
    alpha = 1e-5
  )

  # Get symmetric buffer around estimate
  a_buffer <- max(a_est - bounds$a_CI_lower, bounds$a_CI_upper - a_est)
  b_buffer <- max(b_est - bounds$b_CI_lower, bounds$b_CI_upper - b_est)
  c_buffer <- max(c_est - bounds$c_CI_lower, bounds$c_CI_upper - c_est)

  # Get min and max values for error parameters
  a_min <- max(TOL_0, a_est - a_buffer)
  a_max <- a_est + a_buffer

  b_min <- max(TOL_0, b_est - b_buffer)
  b_max <- b_est + b_buffer

  c_min <- max(TOL_0, c_est - c_buffer)
  c_max <- c_est + c_buffer

  # Make intervals
  a_vec <- seq(from = a_min, to = a_max, length.out = abc_grid_resolution)
  b_vec <- seq(from = b_min, to = b_max, length.out = abc_grid_resolution)
  c_vec <- seq(from = c_min, to = c_max, length.out = abc_grid_resolution)

  # Grid of a, b and c
  abc_grid <- expand.grid(a_vec, b_vec, c_vec)

  # Get training log lik
  abc_grid_train_log_lik <- get_abc_grid_log_lik(
    N_WT_only_vec = N_WT_only_vec,
    N_M_only_vec = N_M_only_vec,
    N_d_neg_vec = N_d_neg_vec,
    N_d_pos_vec = N_d_pos_vec,
    l_vec = l_est_vec,
    abc_grid = abc_grid
  )

  # Gather results
  integrated_model <- list(
    a_est = a_est,
    b_est = b_est,
    c_est = c_est,
    l_est_vec = l_est_vec,
    abc_grid = abc_grid,
    abc_grid_train_log_lik = abc_grid_train_log_lik
  )

  return(integrated_model)
}

#' @title Test sample using integrated CASTLE model
#'
#' @description Tests samples for the presence of mutant DNA using the
#'   integrated version of the CASTLE algorithm.
#'
#' @param test_samples \code{data.frame} with samples to be tested as rows. At
#'   least the following columns should be present:
#'     \itemize{
#'       \item{\code{WildtypeOnlyDroplets}}
#'       \item{\code{MutantOnlyDroplets}}
#'       \item{\code{DoubleNegativeDroplets}}
#'       \item{\code{DoublePositiveDroplets}}
#'    }
#' If data from QuataSoft is available, these can be imported using
#' \code{\link{import_QS_files}}.
#' @param integrated_model A fitted model as generated by
#'   \code{\link{train_integrated_ddpcr_model}}.
#' @param include_mutant_CI Logical. Should confidence intervals for mutant
#'   DNA statistics be included? Default is \code{TRUE}.
#' @param include_wildtype_CI Logical. Should confidence intervals for wild type
#'   DNA statistics be included? Default is \code{TRUE}.
#' @param alpha The alpha used for the statistical test and confidence
#'   intervals. Default is 0.01.
#'
#' @return A \code{data.frame} with results from the CASTLE algorithm (parameter
#'   estimates, CIs, p-values etc.) joined to the right of \code{test_samples}.
#'   Following is the full list of information added:
#'   \describe{ \item{\code{test_statistic}/\code{p_val}}{Log-likelihood-ratio
#'   test statistic/p-value from the statistical test for the presence of mutant
#'   DNA (H0: No mutant DNA, HA: Mutant DNA present).}
#'   \item{\code{mutation_detected}}{Logical. TRUE if the mutation is detected
#'   i.e. if the p-value < \code{alpha}.}
#'   \item{\code{allele_frequency}}{Estimated allele frequency of the mutant allele (error corrected).}
#'   \item{\code{allele_frequency_CI_[lower|upper]}}{Confidence interval for
#'   \code{allele_frequency}.}
#'   \item{\code{mutant_molecules_per_droplet}}{Estimated number of mutant
#'   molecules per droplet (error corrected).}
#'   \item{\code{mutant_molecules_per_droplet_CI_[lower|upper]}}{Confidence
#'   interval for \code{mutant_molecules_per_droplet}.}
#'   \item{\code{wildtype_molecules_per_droplet}}{Estimated number of wildtype
#'   molecules per droplet.}
#'   \item{\code{wildtype_molecules_per_droplet_CI_[lower|upper]}}{Confidence
#'   interval for \code{wildtype_molecules_per_droplet}.}
#'   \item{\code{total_mutant_molecules}}{Estimated total number of mutant
#'   molecules in sample (error corrected).}
#'   \item{\code{total_mutant_molecules_CI_[lower|upper]}}{Confidence interval
#'   for \code{total_mutant_molecules}.}
#'   \item{\code{total_wildtype_molecules}}{Estimated number of wildtype
#'   molecules per droplet.}
#'   }
#'
#' @seealso \code{\link{train_integrated_ddpcr_model}}
#' @export
#'
#' @import dplyr
test_tumor_sample_integrated <- function(test_samples,
                                         integrated_model,
                                         alpha = 0.01,
                                         include_mutant_CI = TRUE,
                                         include_wildtype_CI = TRUE) {
  # Check input
  check_input_samples(test_samples)

  # Test samples
  res_df <- NULL
  for (i in 1:nrow(test_samples)) {
    # Unpack input for sample i
    test_sample <- test_samples[i, ]

    # Data
    N_WT_only <- test_sample$WildtypeOnlyDroplets
    N_M_only <- test_sample$MutantOnlyDroplets
    N_d_neg <- test_sample$DoubleNegativeDroplets
    N_d_pos <- test_sample$DoublePositiveDroplets

    # Model parameters
    abc_grid <- integrated_model$abc_grid
    abc_grid_train_log_lik <- integrated_model$abc_grid_train_log_lik

    # Estimate l and r for test sample
    # Get starting value from simple method
    l_est <- estimate_l(
      N_WT_only = N_WT_only,
      N_M_only = N_M_only,
      N_d_neg = N_d_neg,
      N_d_pos = N_d_pos
    )

    r_est <- estimate_r_integrated(
      N_WT_only = N_WT_only,
      N_M_only = N_M_only,
      N_d_neg = N_d_neg,
      N_d_pos = N_d_pos,
      l_est = l_est, integrated_model = integrated_model
    )

    res <- test_r_equal_0_integrated(
      l_est = l_est,
      r_est = r_est,
      N_WT_only = N_WT_only,
      N_M_only = N_M_only,
      N_d_neg = N_d_neg,
      N_d_pos = N_d_pos,
      abc_grid_train_log_lik = abc_grid_train_log_lik,
      abc_grid = abc_grid,
      include_mutant_CI = include_mutant_CI,
      alpha = alpha
    )

    if (include_wildtype_CI) {
      l_CI <- get_l_CI(
        N_M_only = N_M_only,
        N_d_neg = N_d_neg,
        N_WT_only = N_WT_only,
        N_d_pos = N_d_pos,
        alpha = alpha
      )

      # Add CI's to results
      res <- append(res, list(
        # Estimated values for l:
        wildtype_molecules_per_droplet_CI_lower = l_CI$lower,
        wildtype_molecules_per_droplet_CI_upper = l_CI$upper
      ))
    }

    # Collect results
    res_df <- bind_rows(res_df, bind_cols(test_sample, res))
  }

  return(res_df)
}
simondrue/castle documentation built on Jan. 29, 2022, 3:04 a.m.