R/castle_simple.R

Defines functions test_tumor_sample_simple train_simple_ddpcr_model

Documented in test_tumor_sample_simple train_simple_ddpcr_model

#' @title Train simple CASTLE model
#'
#' @description Estimates the parameters of the statistical model for the simple
#'   version the CASTLE algorithm based on a set of training samples, assumed to
#'   have no mutant DNA present.
#'
#' @param background_samples \code{data.frame} with cancer negative training
#'   samples as rows. These are the samples used to train/estimate the
#'   parameters of the model. At least the following columns should be present:
#'   \itemize{
#'     \item{\code{WildtypeOnlyDroplets}}
#'     \item{\code{MutantOnlyDroplets}}
#'     \item{\code{DoubleNegativeDroplets}}
#'     \item{\code{DoublePositiveDroplets}} } If data from QuantaSoft is available,
#'   these can be imported using \code{\link{import_QS_files}}.
#'
#' @return List of parameter estimates for the simple CASLTE model. This should
#'   be used as input for \code{\link{test_tumor_sample_simple}}.
#' @seealso \code{\link{import_QS_files}},
#'   \code{\link{test_tumor_sample_simple}}
#' @export

train_simple_ddpcr_model <- function(background_samples) {
  # Check input
  check_input_samples(background_samples)

  # Check if samples are appropriate for training
  check_background_samples(background_samples)

  # Unpack data
  N_WT_only_vec <- background_samples$WildtypeOnlyDroplets
  N_M_only_vec <- background_samples$MutantOnlyDroplets
  N_d_neg_vec <- background_samples$DoubleNegativeDroplets
  N_d_pos_vec <- background_samples$DoublePositiveDroplets

  # Estimate lambdas
  l_est_vec <- log((N_WT_only_vec + N_d_pos_vec) / (N_d_neg_vec + N_M_only_vec) + 1)

  # Get starting values (rough estimates) of a, b and c
  start_values <- get_start_values_abc(
    N_WT_only_vec = N_WT_only_vec,
    N_M_only_vec = N_M_only_vec,
    N_d_neg_vec = N_d_neg_vec,
    N_d_pos_vec = N_d_pos_vec,
    l_est_vec = l_est_vec
  )

  # Set start at mean of individual samples
  start_vec <- log(c(
    start_values$a_start,
    start_values$b_start,
    start_values$c_start
  ))

  # Estimate alpha, beta and gamma by maximizing log-likelihood
  optim_res <- stats::optim(
    par = start_vec,
    method = "BFGS",
    fn = full_log_lik_train,
    gr = grad_full_log_lik_train,
    l_est_vec = l_est_vec,
    N_WT_only_vec = N_WT_only_vec,
    N_M_only_vec = N_M_only_vec,
    N_d_neg_vec = N_d_neg_vec,
    N_d_pos_vec = N_d_pos_vec,
    control = list(
      fnscale = -1,
      reltol = RELTOL
    )
  )

  par_est <- exp(optim_res$par)

  res <- list(
    a_est = par_est[1],
    b_est = par_est[2],
    c_est = par_est[3],
    l_est_vec = l_est_vec
  )
  return(res)
}

#' @title Test samples using simple CASTLE model
#'
#' @description Tests samples for the presence of mutant DNA using the
#'   simple version of the CASTLE algorithm.
#'
#' @param test_samples \code{data.frame} with samples to be tested as rows. At
#'   least the following columns should be present:
#'     \itemize{
#'       \item{\code{WildtypeOnlyDroplets}}
#'       \item{\code{MutantOnlyDroplets}}
#'       \item{\code{DoubleNegativeDroplets}}
#'       \item{\code{DoublePositiveDroplets}}
#'    }
#' If data from QuataSoft is available, these can be imported using
#' \code{\link{import_QS_files}}.
#' @param simple_model A fitted model as generated by
#'   \code{\link{train_simple_ddpcr_model}}.
#' @param include_mutant_CI Logical. Should confidence intervals for mutant
#'   DNA statistics be included? Default is \code{TRUE}.
#' @param include_wildtype_CI Logical. Should confidence intervals for wild type
#'   DNA statistics be included? Default is \code{TRUE}.
#' @param alpha The alpha used for the statistical test and confidence
#'   intervals. Default is 0.01.
#'
#' @return A \code{data.frame} with results from the CASTLE algorithm (parameter
#'   estimates, CIs, p-values etc.) joined to the right of \code{test_samples}.
#'   Following is the full list of information added:
#'   \describe{ \item{\code{test_statistic}/\code{p_val}}{Log-likelihood-ratio
#'   test statistic/p-value from the statistical test for the presence of mutant
#'   DNA (H0: No mutant DNA, HA: Mutant DNA present).}
#'   \item{\code{mutation_detected}}{Logical. TRUE if the mutation is detected
#'   i.e. if the p-value < \code{alpha}.}
#'   \item{\code{allele_frequency}}{Estimated allele frequency of the mutant allele (error corrected).}
#'   \item{\code{allele_frequency_CI_[lower|upper]}}{Confidence interval for
#'   \code{allele_frequency}.}
#'   \item{\code{mutant_molecules_per_droplet}}{Estimated number of mutant
#'   molecules per droplet (error corrected).}
#'   \item{\code{mutant_molecules_per_droplet_CI_[lower|upper]}}{Confidence
#'   interval for \code{mutant_molecules_per_droplet}.}
#'   \item{\code{wildtype_molecules_per_droplet}}{Estimated number of wildtype
#'   molecules per droplet.}
#'   \item{\code{wildtype_molecules_per_droplet_CI_[lower|upper]}}{Confidence
#'   interval for \code{wildtype_molecules_per_droplet}.}
#'   \item{\code{total_mutant_molecules}}{Estimated total number of mutant
#'   molecules in sample (error corrected).}
#'   \item{\code{total_mutant_molecules_CI_[lower|upper]}}{Confidence interval
#'   for \code{total_mutant_molecules}.}
#'   \item{\code{total_wildtype_molecules}}{Estimated number of wildtype
#'   molecules per droplet.} }
#'
#' @seealso \code{\link{train_simple_ddpcr_model}}
#' @export
test_tumor_sample_simple <- function(test_samples,
                                     simple_model,
                                     include_wildtype_CI = TRUE,
                                     include_mutant_CI = TRUE,
                                     alpha = 0.01) {

  # Check input
  check_input_samples(test_samples)

  res_df <- NULL
  for (i in 1:nrow(test_samples)) {
    test_sample <- test_samples[i, ]

    # Unpack parameters
    N_WT_only <- test_sample$WildtypeOnlyDroplets
    N_M_only <- test_sample$MutantOnlyDroplets
    N_d_neg <- test_sample$DoubleNegativeDroplets
    N_d_pos <- test_sample$DoublePositiveDroplets

    a_est <- simple_model$a_est
    b_est <- simple_model$b_est
    c_est <- simple_model$c_est

    # Estimate parameters
    l_est <- estimate_l(
      N_WT_only = N_WT_only,
      N_M_only = N_M_only,
      N_d_neg = N_d_neg,
      N_d_pos = N_d_pos
    )

    r_est <- estimate_r(
      N_WT_only = N_WT_only,
      N_M_only = N_M_only,
      N_d_neg = N_d_neg,
      N_d_pos = N_d_pos,
      l_est = l_est,
      a_est = a_est,
      b_est = b_est,
      c_est = c_est
    )

    res <- test_r_equal_0_simple(
      N_WT_only = N_WT_only,
      N_M_only = N_M_only,
      N_d_neg = N_d_neg,
      N_d_pos = N_d_pos,
      l_est = l_est,
      a_est = a_est,
      b_est = b_est,
      c_est = c_est,
      r_est = r_est,
      include_mutant_CI = include_mutant_CI,
      alpha = alpha
    )

    if (include_wildtype_CI) {
      l_CI <- get_l_CI(
        N_M_only = N_M_only,
        N_d_neg = N_d_neg,
        N_WT_only = N_WT_only,
        N_d_pos = N_d_pos,
        alpha = alpha
      )

      # Add CI's to results
      res <- append(res, list(
        # Estimated values for l:
        wildtype_molecules_per_droplet_CI_lower = l_CI$lower,
        wildtype_molecules_per_droplet_CI_upper = l_CI$upper
      ))
    }

    # Collect results
    res_df <- dplyr::bind_rows(res_df, dplyr::bind_cols(test_sample, res))
  }
  return(res_df)
}
simondrue/castle documentation built on Jan. 29, 2022, 3:04 a.m.