R/priors.R

Defines functions .compute_priors .compute_mutation_prior .compute_context_prior .compute_global_prior

#' prior functions


# P(m|M,C) Given a context (including reference), and the fact that a mutation has
# occurred what is the probability that the mutation is to each of the three possible alternate bases.
# 96 rows, each (context,cref) combination (n=32) has three elements, and the sum of their probabilities
# is 1

.compute_priors <- function(vr,reference,prior_lod){
  prior_vars <- vr %>% dplyr::filter(mutect_odds >= prior_lod & pass_all)
  mutation_prior <- .compute_mutation_prior(prior_vars)
  context_prior <- .compute_context_prior(prior_vars)
  global_prior <- .compute_global_prior(reference)
  #P(M) = 3e-6 = mu and P(C) = 1/96 (there are 96 context) P(M)/P(C) = 96*3e-6 = 2.88e-4
  # full prior= mutation_prior * context_prior * P(M)/P(C)
  # 4/23/2018: Add the real global context prior. replacing P(C) = 1/96
  joint_prior <- mutation_prior %>%
    dplyr::left_join(context_prior,by=c('context','cref')) %>%
    dplyr::left_join(global_prior,by=c('context','cref')) %>%
    dplyr::mutate(joint_prior=((mutation_prior*context_prior)/global_prior)*3e-6) %>%
    dplyr::select(cref,calt,context,mutation_prior,joint_prior,global_prior)
  vr <- vr %>% dplyr::left_join(joint_prior,by=c('context','cref','calt')) %>%
    tidyr::replace_na(list(joint_prior=1e-6)) %>%
    dplyr::mutate(logit_prior=log10(joint_prior/(1-joint_prior)),
                  prior_odds=10**(TLOD+logit_prior))
  vr

}

.compute_mutation_prior <- function(vars) {
  vars %>% dplyr::group_by(context,cref,calt) %>%
  dplyr::summarise(n=n()) %>%
  dplyr::mutate(mutation_prior=n/sum(n)) %>%
  dplyr::select(-n)
}

.compute_context_prior <- function(vars){
  vars %>% dplyr::group_by(context,cref) %>%
    dplyr::summarise(n=n()) %>% dplyr::ungroup() %>%
    dplyr::mutate(context_prior=n/sum(n)) %>%
    dplyr::select(-n)
}

.compute_global_prior <- function(reference){
  seqs <- SomaticSignatures::kmerFrequency(reference,n=1e6,k=3)
  fr <- data.frame(ctxt=as.character(names(seqs)),global_prior=as.numeric(seqs))
  fr <- fr %>% dplyr::mutate(context=paste0(stringr::str_sub(ctxt,1L,1L),'.',stringr::str_sub(ctxt,-1L,-1L)),
                             cref=stringr::str_sub(ctxt,2L,2L))
  fr
}
bmannakee/smartcallr documentation built on May 3, 2019, 1:47 p.m.