#' SignIT-Pop inference of populations and signatures
#'
#' Jointly infers mutational subpopulations and their associated mutation signature exposures
#'
#' \code{get_population_signatures} is the central function which facilitates Bayesian inference
#' of mutational populations and signatures. This model infers a matrix of L x N parameters,
#' where L is the number of populations and N is the number of signatures. The posterior distribution
#' of each parameter is estimated using either automatic differentiation variational inference
#' or Hamiltonial Monte Carlo using the \code{\link[rstan]{vb}} and \code{\link[rstan]{sampling}}
#' methods respectively of the \pkg{rstan} package (an interface to the Stan probabilistic
#' programming language).
#'
#' @param mutation_table Table of mutations, one per row. The minimum input requires the
#' following columns: \itemize{
#' \item \strong{total_depth}: Total number of reads covering mutated locus.
#' \item \strong{alt_depth}: Total number of mutant reads covering locus.
#' \item \strong{tumour_copy}: Tumour copy number at the mutated locus
#' \item \strong{normal_copy}: Normal copy number at the mutated locus
#' \item \strong{tumour_content}: Estimated tumour content as a fraction
#' between 0 and 1. Must be the same value
#' throughout the whole table.
#' }
#'
#' @param reference_signatures Reference mutation signatures. This can either be from
#' \code{\link{get_reference_signatures}} or a custom data frame formatted
#' equivalently.
#'
#' @param subset_signatures Boolean. If TRUE (default), then \code{\link{subset_reference_signatures}}
#' is run to pre-select a smaller subset of signatures most likely to be active
#' in the cancer. This helps to reduce processing time and model complexity, but
#' may bias the result.
#'
#' @param n_populations The number of populations to screen for. Must be an integer. If no value is
#' provided, then a model selection step is engaged to automatically estimate
#' the number of populations. The automatic model selection uses
#' \code{\link{select_n_populations}}, which performs a maximum a posteriori
#' estimate using the SignIT population model (without mutation signature inference).
#'
#' @param genome A BSgenome object. This is used to determine trinucleotide contexts of mutations
#' to define mutation types. By default, uses BSgenome.Hsapiens.UCSC.hg19. To
#' define custom mutation types, simply include a column named \code{mutation_type} in
#' \code{mutation_table}, in which case this parameter is ignored.
#'
#' @param method The posterior sampling method. This is a string and can either be 'vb' for
#' automatic variational Bayes or 'mcmc' for Hamiltonial Monte Carlo.
#'
#' @param n_chains Number of chains to sample. Only relevant if \code{method == 'mcmc'}.
#'
#' @param n_cores Number of cores for parallel sampling. By default this equals the number of chains.
#' Only relevant if \code{method == 'mcmc'}.
#'
#' @param n_iter Number of sampling iterations per chain. These are distinct from adaptation iterations,
#' so the total number of iterations will be \code{n_iter + n_adapt}.
#' Only relevant if \code{method == 'mcmc'}.
#'
#' @param n_adapt Number of adaptation iterations per chain. Only relevant if \code{method == 'mcmc'}.
#'
#' @return A list object with the posterior sampling of population signatures plus relevant input and metadata.
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom BSgenome getBSgenome
#' @import rstan
#'
#' @export
get_population_signatures <- function(
mutation_table,
reference_signatures = NULL,
subset_signatures = TRUE,
n_populations = NULL,
genome = NULL,
method = 'vb',
n_chains = 10,
n_cores = 1,
n_iter = 300,
n_adapt = 200,
prevalences = NULL
) {
if (get_os() == 'windows' && n_cores > 1) {
stop("Multicore processing is not available on Windows. Please leave n_cores = 1")
}
stopifnot(
length(prevalences) == n_populations || is.null(prevalences) || is.null(n_populations),
is.null(prevalences) || all(prevalences < 1)
)
if (is.null(genome)) {
genome = getBSgenome('BSgenome.Hsapiens.UCSC.hg19')
}
mutation_table <- mutation_table %>%
distinct() %>%
filter(
!is.na(chr),
!is.na(pos),
!is.na(ref),
!is.na(alt),
!is.na(total_depth),
!is.na(alt_depth),
!is.na(normal_copy),
!is.na(tumour_content)
) %>%
mutate(
correction = get_vaf_correction(.)
)
if ('mutation_type' %in% colnames(mutation_table)) {
message("Using pre-specified mutation types.")
} else {
mutation_table <- mutation_table %>%
mutate(
mutation_type = get_snv_mutation_type(
chr, pos, ref, alt, genome
)
) %>%
filter(
! grepl('N', mutation_type)
)
}
n_mutations <- dim(mutation_table)[1]
stopifnot(
mutation_table$tumour_content %>% unique %>% length == 1
)
tumour_content <- mutation_table$tumour_content[1]
catalog <- with(
mutation_table,
mutations_to_catalog(chr, pos, ref, alt)
)
if (is.null(reference_signatures)) {
reference_signatures <- get_reference_signatures()
}
if (subset_signatures) {
message('Subsetting reference signatures on request')
reference_signatures <- subset_reference_signatures(catalog, reference_signatures, n_cores = n_cores)
}
reference_signatures <- normalize_reference_signatures(reference_signatures)
if (is.null(prevalences)) {
if (is.null(n_populations)) {
message('No number of populations provided. Will run automatic model selection.')
n_population_selection <- select_n_populations(mutation_table)
n_populations <- n_population_selection$optimal_n_populations
mu <- n_population_selection$evaluated_models[[n_populations]]$parameter_estimates$mu
kappa <- n_population_selection$evaluated_models[[n_populations]]$parameter_estimates$kappa
message(sprintf('Engaged auto-selection of clusters. Chose a model with %s clusters.', n_populations))
} else {
message(sprintf('Extracting parameters from population model with %s populations', n_populations))
n_population_selection <- NULL
}
message(sprintf(
'Running model with provided reference signatures: %s signatures, %s mutations, and %s clusters',
reference_signatures %>% select(-mutation_type) %>% ncol,
mutation_table %>% nrow,
n_populations
))
stan_dso = stanmodels$signit_model_infer_populations
stan_data = list(
N = nrow(mutation_table),
# Signature-specific data
K = reference_signatures %>% select(-mutation_type) %>% dim %>% .[2],
R = reference_signatures %>% dim %>% .[1],
v = factor(mutation_table$mutation_type, levels = reference_signatures$mutation_type) %>% as.numeric,
ref_signatures = reference_signatures %>% select(-mutation_type) %>% t,
# Clonality-specific data
L = n_populations,
x = as.numeric(mutation_table$alt_depth),
d = as.numeric(mutation_table$total_depth),
a = as.numeric(mutation_table$correction)
)
} else {
message(sprintf('Using predetermined population prevalences: %s', paste(prevalences, ', ')))
n_population_selection <- NULL
n_populations <- length(prevalences)
message(sprintf(
'Running model with provided reference signatures: %s signatures, %s mutations, and %s clusters',
reference_signatures %>% select(-mutation_type) %>% ncol,
mutation_table %>% nrow,
n_populations
))
stan_dso = stanmodels$signit_model
stan_data = list(
N = nrow(mutation_table),
# Signature-specific data
K = reference_signatures %>% select(-mutation_type) %>% dim %>% .[2],
R = reference_signatures %>% dim %>% .[1],
v = factor(mutation_table$mutation_type, levels = reference_signatures$mutation_type) %>% as.numeric,
ref_signatures = reference_signatures %>% select(-mutation_type) %>% t,
# Clonality-specific data
L = n_populations,
x = as.numeric(mutation_table$alt_depth),
d = as.numeric(mutation_table$total_depth),
a = as.numeric(mutation_table$correction),
mu = as.numeric(prevalences)
)
}
print(stan_data)
message('Sampling')
parameter_dimension <- stan_data$L * stan_data$K
if (method == 'mcmc') {
stan_fit = sampling(
object = stan_dso,
data = stan_data,
chains = n_chains,
iter = n_iter + n_adapt,
warmup = n_adapt,
cores = n_cores
)
} else if (method == 'vb') {
stan_fit = vb(
object = stan_dso,
data = stan_data
)
} else {
stop('Method argument must be either "mcmc" or "vb"')
}
output <- list(
mutation_table = mutation_table,
reference_signatures = reference_signatures,
n_populations = n_populations,
mcmc_output = stan_fit
)
output['model_selection_data'] = n_population_selection
return(output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.