knitr::opts_chunk$set(echo = TRUE, fig.path = 'figures/') library(tidyverse) library(cowplot) library(rmutil) library(rstan) library(devtools) load_all('..')
mutations_raw <- read_tsv('stan_data.tsv') mutations <- mutations_raw %>% mutate( f_m = (alt_depth * (tumour_content * tumour_copy + (1 - tumour_content) * normal_copy)) / (total_depth * tumour_content) ) print(mutations) rcat <- function(n, p) { apply(rmultinom(n, 1, p), 2, function(z) {which(z == 1)}) }
mutations %>% ggplot(aes( x = f_m )) + geom_histogram()
Calculating BIC from MCMC output
compute_likelihood <- function(successes, trials, correction, clone_prop, mu, kappa) { tibble( idx = 1:length(mu), mu, clone_prop ) %>% plyr::ddply(c('idx', 'clone_prop'), function(z) { print(length(correction * z$mu)) likelihoods = rmutil::dbetabinom(successes, trials, correction * z$mu, kappa) return(data.frame( mutation_id = 1:length(successes), likelihoods )) }) %>% group_by(mutation_id) %>% summarise( likelihood = log(sum(likelihoods * clone_prop)) ) %>% ungroup() %>% .$likelihood %>% sum } mcmc_out <- readRDS('~/1_clusters.Rds') mcmc_chains <- mcmc_out %>% as.array %>% plyr::adply(2, function(z) { as_tibble(z) }) %>% mutate( iteration = row_number(), chain = factor(chains) %>% as.integer ) %>% select(-chains) %>% as_tibble() %>% gather(parameter, value, -chain, -iteration) mcmc_mu <- mcmc_chains %>% group_by(chain, iteration) %>% mutate( parameter_name = gsub('(.*?)\\[.*?\\].*', '\\1', parameter) ) %>% filter(parameter_name == 'clone_prop' | parameter_name == 'mu') %>% mutate( parameter_index = gsub('.*?\\[(.*?)\\].*', '\\1', parameter) %>% as.numeric ) %>% select(-parameter) %>% arrange(parameter_name, parameter_index) %>% mutate( parameter_index = rep(order(value[parameter_name == 'clone_prop']), 2) ) %>% ungroup() %>% arrange(chain, iteration, parameter_name, parameter_index) mcmc_mu %>% ggplot(aes( x = value )) + geom_histogram() + facet_grid(parameter_name ~ parameter_index) mu_estimate = mcmc_mu %>% filter(parameter_name == 'mu') %>% group_by(parameter_name, parameter_index) %>% summarise( mean = mean(value) ) %>% arrange(parameter_index) %>% .$mean clone_estimate = mcmc_mu %>% filter(parameter_name == 'clone_prop') %>% group_by(parameter_name, parameter_index) %>% summarise( mean = mean(value) ) %>% arrange(parameter_index) %>% .$mean kappa_estimate = mcmc_chains %>% filter(parameter == 'kappa_minus_two') %>% .$value %>% mean + 2 log_lik = with( mutations, compute_likelihood( successes = alt_depth, trials = total_depth, correction = tumour_content / ((tumour_content * tumour_copy) + ((1 - tumour_content) * normal_copy)), clone_prop = clone_estimate, mu = mu_estimate, kappa = kappa_estimate ) ) n = nrow(mutations) k = length(mu_estimate) bic = log(n) * k - ( 2 * log_lik ) print(bic)
Let's attempt EM
library(purrr) fit_beta_binom_mle <- function(alt_depth, total_depth, correction) { log_likelihood <- function(mu, kappa) { -sum(rmutil::dbetabinom(alt_depth, total_depth, mu * correction, kappa, log = TRUE)) } m <- stats4::mle( log_likelihood, start = list(mu = 0.5, kappa = 10), method = "L-BFGS-B", lower = c(0.001, 0.001), upper = c(1, Inf) ) ab <- stats4::coef(m) data_frame(mu = ab[1], kappa = ab[2], number = length(alt_depth)) } iterate_em <- function(state, ...) { fits <- state$assignments %>% group_by(cluster) %>% do(mutate(fit_beta_binom_mle(.$alt_depth, .$total_depth, .$correction), number = nrow(.))) %>% ungroup() %>% mutate(prior = number / sum(number)) assignments <- assignments %>% select(-cluster) %>% crossing(fits) %>% mutate(likelihood = rmutil::dbetabinom(alt_depth, total_depth, mu * correction, kappa)) %>% group_by(mutation_id) %>% top_n(1, likelihood) %>% ungroup() list(assignments = assignments, fits = fits) } n_cluster = 10 init = mutations %>% mutate( mutation_id = row_number(), cluster = rcat(n(), rep(1, n_cluster) / n_cluster), correction = tumour_content / ((tumour_content * tumour_copy) + ((1 - tumour_content) * normal_copy)) ) assignments <- init iterations <- accumulate(1:10, iterate_em, .init = list(assignments = init)) fit_iterations <- iterations %>% map_df("fits", .id = "iteration") %>% mutate(iteration = as.numeric(iteration)) fit_iterations %>% gather(parameter, estimate, mu, kappa) %>% ggplot(aes( x = iteration, y = estimate, colour = cluster, group = cluster )) + geom_line() + facet_grid(. ~ parameter) fit <- iterations[[10]]$fits out <- iterations[[10]]$assignments bic_df <- out %>% mutate( likelihood = dbetabinom(alt_depth, total_depth, mu * correction, kappa, log = TRUE) ) n = nrow(bic_df) k = bic_df$cluster %>% unique %>% length L = bic_df$likelihood %>% sum BIC = -2 * L + ( k * log(n)) print(BIC)
f_m_true = c(0.44986, 0.96522) clone_prop = c(0.35836, 0.64164) dispersion = 1/0.01582 z = rcat(dim(mutations)[1], clone_prop) f_m = sapply(z, function(i) {f_m_true[i]}) a = with(mutations, tumour_content / (tumour_content * tumour_copy + (1 - tumour_content) * normal_copy)) binom_prob = f_m * a x = sapply(1:dim(mutations)[1], function(n) { rmutil::rbetabinom(n = 1, size = mutations$total_depth[n], m = binom_prob[n], s = 600) }) simulated_var_reads <- tibble(simulated_var = x/ mutations$total_depth) %>% ggplot(aes( x = simulated_var )) + geom_histogram() actual_var_reads <- mutations %>% ggplot(aes( x = alt_depth / total_depth )) + geom_histogram() plot_grid( simulated_var_reads, actual_var_reads, nrow = 1, labels = c('Sim', 'Actual') )
dispersion = 1/0.01582 reference_signatures <- get_reference_signatures() signature_names <- reference_signatures %>% select(-mutation_type) %>% colnames S <- reference_signatures %>% select(-mutation_type) %>% as.matrix mutation_types <- reference_signatures$mutation_type f_m = c(0.2, 0.96522) # True values of f * m for each subpopulation phi_exposures = tibble( population_1 = c(0.1, 0.8, rep(0, 10), 0.1, rep(0, 17)), population_2 = c(0.1, 0.1, 0.8, rep(0, 26), 0) ) clone_prop = c(0.3, 0.7) phi <- t(t(phi_exposures) * clone_prop) %>% as_tibble %>% mutate(signature = factor(signature_names, levels = signature_names)) %>% gather(population, probability, -signature) %>% mutate(population = factor(population)) N = 10000 # Number of mutations latent = rcat(N, phi$probability) z <- phi[latent, ][['signature']] # Latent Signature u <- phi[latent, ][['population']] # Latent Population f_m_simulated = sapply(u %>% as.numeric, function(population_index) { f_m[population_index] }) tumour_content = 0.75 tumour_copy = 2 normal_copy = 2 a = tumour_content / (tumour_content * tumour_copy + (1 - tumour_content) * normal_copy) binom_prob = f_m_simulated * a # VAF simulation v = sapply(1:N, function(n) { rbetabinom(n = 1, size = 200, m = binom_prob[n], s = dispersion) }) # Mutation Class Simulation x = sapply(z %>% as.numeric, function(signature_index) { sig_probability_vector = S[, signature_index] rcat(1, sig_probability_vector) }) # Plots mutations <- tibble( mutation_type = reference_signatures[x, ][['mutation_type']], vaf = v ) %>% mutate( context = gsub('(.)\\[.>.\\](.)', '\\1\\2', mutation_type), transition = gsub('.\\[(.>.)\\].', '\\1', mutation_type) ) mutations %>% filter(vaf > 50) %>% ggplot(aes( x = context )) + geom_bar() + facet_grid(. ~ transition) + theme( axis.text.x = element_text(angle = 90, size = 6) ) mutations %>% ggplot(aes( x = vaf )) + geom_histogram()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.