R/pairwise_variant_epistasis.R

Defines functions pairwise_variant_epistasis

Documented in pairwise_variant_epistasis

#' Estimate selection under pairwise epistasis model
#' 
#' @param cesa CESAnalysis
#' @param samples Validated samples data.table (as from select_samples())
#' @param variant_pair 2-length character of variant IDs, or 2-length numeric giving
#'   indices of CompoundVariantSet for the current two compound variants
#' @param compound_variants If testing a pair of compound variants, the CompoundVariantSet defining them
#' @param model Passed from ces_epistasis or ces_gene_epistasis. Set to "default" to use built-in
#'   model of epistatic selection, or supply a custom function factory (see details).
#' @param lik_args Extra arguments, given as a list, passed from ces_epistasis or ces_gene_epistasis
#' @param optimizer_args List of arguments to pass to the optimizer (bbmle::mle2).
#' @param pval_calc_fn For use with custom models; optional. A function that takes an epistasis
#'   model fit as input and returns p-values and other descriptives.
#' 
#' @keywords internal
pairwise_variant_epistasis = function(cesa, variant_pair, samples, conf, compound_variants = NULL, model = "default", 
                                      lik_args = list(), optimizer_args = list(), pval_calc_fn = NULL) {
  
  running_compound = FALSE
  if (is(compound_variants, "CompoundVariantSet")) {
    running_compound = TRUE
    compound_variants = compound_variants[variant_pair]
    joint_coverage = c("genome", intersect(compound_variants@compounds$shared_cov[[1]], compound_variants@compounds$shared_cov[[2]]))
    v1 = compound_variants@compounds$compound_name[[1]]
    v2 = compound_variants@compounds$compound_name[[2]]
    v1_ids = compound_variants@snvs[compound_name == v1, snv_id]
    v2_ids = compound_variants@snvs[compound_name == v2, snv_id]
    variant_ids = c(v1_ids, v2_ids)
  } else {
    v1 = variant_pair[1]
    v2 = variant_pair[2]
    
    v1_coverage = c(cesa@mutations$amino_acid_change[v1, unlist(covered_in), nomatch = NULL], 
                    cesa@mutations$snv[v1, unlist(covered_in), nomatch = NULL])
    v2_coverage = c(cesa@mutations$amino_acid_change[v2, unlist(covered_in), nomatch = NULL], 
                    cesa@mutations$snv[v2, unlist(covered_in), nomatch = NULL])
    
    # Samples have to have v1 and v2 coverage (and samples with covered_regions == "genome" always have coverage)
    joint_coverage = c("genome", intersect(v1_coverage, v2_coverage))
    variant_ids = c(v1, v2)
  }
  
  eligible_tumors = samples[covered_regions %in% joint_coverage, Unique_Patient_Identifier]
  if (length(eligible_tumors) == 0) {
    warning(sprintf("No samples have coverage at both %s and %s, so this variant pair had to be skipped.", v1, v2), immediate. = T, call. = F)
  }
  all_rates = baseline_mutation_rates(cesa = cesa, variant_ids = variant_ids, samples = eligible_tumors)
  
  if (running_compound) {
    # sample_calls could include tumors that don't have coverage across both in pair
    tumors_with_v1 = intersect(compound_variants@sample_calls[[v1]], eligible_tumors)
    tumors_with_v2 = intersect(compound_variants@sample_calls[[v2]], eligible_tumors)
    tumors_with_both = intersect(tumors_with_v1, tumors_with_v2)
    tumors_just_v1 = setdiff(tumors_with_v1, tumors_with_both)
    tumors_just_v2 = setdiff(tumors_with_v2, tumors_with_both)
    tumors_with_neither = setdiff(eligible_tumors, c(tumors_with_v1, tumors_with_v2))
    
    v1_rates = all_rates[, ..v1_ids]
    v1_rates = rowSums(v1_rates)
    names(v1_rates) = all_rates$Unique_Patient_Identifier
    v2_rates = all_rates[, ..v2_ids]
    v2_rates = rowSums(v2_rates)
    names(v2_rates) = all_rates$Unique_Patient_Identifier
    
    # 2-item lists: first item is baseline rates for v1; second for v2
    with_just_1 = NULL; with_just_2 = NULL; with_both = NULL; with_neither = NULL
    if(length(tumors_just_v1) > 0){
      with_just_1 = list(v1_rates[tumors_just_v1], v2_rates[tumors_just_v1])
    }
    if(length(tumors_just_v2) > 0) {
      with_just_2 = list(v1_rates[tumors_just_v2], v2_rates[tumors_just_v2])
    }
    if (length(tumors_with_both) > 0) {
      with_both = list(v1_rates[tumors_with_both], v2_rates[tumors_with_both])
    }
    if(length(tumors_with_neither) > 0) {
      with_neither = list(v1_rates[tumors_with_neither], v2_rates[tumors_with_neither])
    }
  } else {
    setcolorder(all_rates, c("Unique_Patient_Identifier", v1, v2))
    setkey(all_rates, "Unique_Patient_Identifier")
    covered_maf = cesa@maf[eligible_tumors, on = "Unique_Patient_Identifier", nomatch = NULL]
    tumors_with_v1 = intersect(samples_with(cesa, v1), eligible_tumors)
    tumors_with_v2 = intersect(samples_with(cesa, v2), eligible_tumors)
    tumors_with_both = intersect(tumors_with_v1, tumors_with_v2)
    tumors_just_v1 = setdiff(tumors_with_v1, tumors_with_both)
    tumors_just_v2 = setdiff(tumors_with_v2, tumors_with_both)
    tumors_with_neither = setdiff(eligible_tumors, c(tumors_with_v1, tumors_with_v2))
    
    # 2-item lists: first item is baseline rates for v1; second for v2
    with_just_1 = lapply(as.list(all_rates[tumors_just_v1])[2:3], setNames, tumors_just_v1)
    with_just_2 = lapply(as.list(all_rates[tumors_just_v2])[2:3], setNames, tumors_just_v2)
    with_both = lapply(as.list(all_rates[tumors_with_both])[2:3], setNames, tumors_with_both)
    with_neither = lapply(as.list(all_rates[tumors_with_neither])[2:3], setNames, tumors_with_neither)
  }
  
  validate_optimizer_args(optimizer_args)
  running_default = FALSE
  if(is(model, "character")) {
    if(length(model) != 1 || ! model %in% c("default")) {
      stop("model should specify a built-in selection model (i.e., \"default\") or a custom function factory.")
    } else {
      if (model == 'default') {
        if(! is.null(pval_calc_fn)) {
          warning('pval_calc_fn should usually be NULL when using the default epistasis model.')
          if(! is(pval_calc_fn, 'function') || ! identical(names(formals(pval_calc_fn)), 'fit')) {
            stop('Invalid pval_calc_fn: Must be a function with a single named argument (fit).')
          }
        } else {
          pval_calc_fn = default_epistasis_pvalue_calc
        }
        running_default = TRUE
        lik_factory = pairwise_epistasis_lik
      } else {
        stop("Unrecognized model")
      }
    }
  } else if (! is(model, "function")) {
    stop("model should specify a built-in selection model (\"default\") or a custom function factory.")
  } else {
    lik_factory = model
    if(! is.null(pval_calc_fn) && 
       (! is(pval_calc_fn, 'function') || ! identical(names(formals(pval_calc_fn)), 'fit'))) {
      stop('pval_calc_fn should be NULL or a function that takes a single named argument (fit, the epistatic model fit object).')
    }
  }
  
  if(! is(lik_args, "list")) {
    stop("lik_args should be named list") 
  }
  
  if(length(lik_args) > 0 && is.character(model)){
    if(model %in% c('default')) {
      stop("lik_args aren't used in the chosen model.")
    }
  }
  if(length(lik_args) != uniqueN(names(lik_args))) {
    stop('lik_args should be a named list without repeated names.')
  }
  
  lik_args = c(list(with_just_1 = with_just_1, with_just_2 = with_just_2, with_both = with_both, with_neither = with_neither), lik_args)
  
  # call factory function to get variant-specific likelihood function
  epistasis_lik_fn = do.call(lik_factory, lik_args)
  par_init = formals(epistasis_lik_fn)[[1]]
  names(par_init) = bbmle::parnames(epistasis_lik_fn)
  
  
  # No point of testing epistasis if either variant doesn't appear
  n_total = length(tumors_just_v1) + length(tumors_just_v2) + length(tumors_with_neither) + length(tumors_with_both)
  if (length(tumors_with_v1) == 0 || length(tumors_with_v2) == 0) {
    early_output = list(variant_A = v1, variant_B = v2,
                        nA0 = length(tumors_just_v1),
                        nB0 = length(tumors_just_v2),
                        nAB = length(tumors_with_both),
                        n00 = length(tumors_with_neither),
                        n_total = n_total)
    return(list(summary = early_output, fit = NULL)) # fit list will have a NULL entry
  }
  
  # find optimized selection intensities
  # the selection intensity when some group has 0 variants will be on the lower boundary; will muffle the associated warning
  final_optimizer_args = c(list(minuslogl = epistasis_lik_fn, start = par_init, vecpar = T), optimizer_args)
  if(is.null(final_optimizer_args$control$ndeps)) {
    final_optimizer_args$control$ndeps = rep(.1, length(par_init))
  }
  if(is.null(final_optimizer_args$method)) {
    final_optimizer_args$method = 'L-BFGS-B'
  }
  if(is.null(final_optimizer_args$lower)) {
    final_optimizer_args$lower = 1e-3
  }
  if(is.null(final_optimizer_args$upper)) {
    final_optimizer_args$upper = 1e9
  }
  
  withCallingHandlers(
    {
      fit = do.call(bbmle::mle2, final_optimizer_args)
    },
    warning = function(w) {
      if (startsWith(conditionMessage(w), "some parameters are on the boundary")) {
        invokeRestart("muffleWarning")
      }
    }
  )

  # Calculate p-values and other descriptives
  p_values_and_more = list()
  if(! is.null(pval_calc_fn)) {
    # Convenient for fit to know the names of its variants.
    attr(fit, 'v1') = v1
    attr(fit, 'v2') = v2
    p_values_and_more = pval_calc_fn(fit)
  }
  
  params = bbmle::coef(fit)
  if(running_default) {
    si_output = list(ces_A0 = params[1], ces_B0 = params[2],
                     ces_A_on_B = params[3], ces_B_on_A = params[4])
  } else {
    si_output = as.list(bbmle::coef(fit))
  }
  
  variant_ep_results = c(list(variant_A = v1, variant_B = v2),
                         si_output,
                         p_values_and_more,
                         list(nA0 = length(tumors_just_v1), nB0 = length(tumors_just_v2), nAB = length(tumors_with_both),
                              n00 = length(tumors_with_neither), n_total = n_total))
  if(! is.null(conf)) {
    if(running_default) {
      bbmle::parnames(epistasis_lik_fn) = c('ces_A0', 'ces_B0', 'ces_A_on_B', 'ces_B_on_A')
      ci = univariate_si_conf_ints(fit, epistasis_lik_fn, final_optimizer_args$lower, final_optimizer_args$upper, conf)
    } else {
      withCallingHandlers(
        {
          ci = bbmle::confint(fit, level = conf, quietly = TRUE)
        },
        warning = function(w) {
          if (conditionMessage(w) %like% 'reverting from spline to linear approximation') {
            invokeRestart("muffleWarning")
          }
        })
      
      ci_dt = as.data.table(ci, keep.rownames = 'param')
      ci_dt[, low_name := paste('ci_low', conf * 100, param, sep = '_')]
      ci_dt[, high_name := paste('ci_high', conf * 100, param, sep = '_')]
      ci = unlist(S4Vectors::zipup(ci_dt[[2]], ci_dt[[3]]))
      names(ci) = unlist(S4Vectors::zipup(ci_dt$low_name, ci_dt$high_name))
    }
    variant_ep_results = c(variant_ep_results, ci)
  }
  
  # To reduce size, remove copy of CESAnalysis from fit environment.
  rm('cesa', envir = environment(fit))
  
  return(list(summary = variant_ep_results, fit = fit))
}
Townsend-Lab-Yale/cancereffectsizeR documentation built on Feb. 15, 2025, 10:23 a.m.