#' Simple simulation of MOI SNV data
#'
#' @description Generate read count data in support of SNVs partioned by
#' mixtures of clones within a host. This is done by setting true
#' clonal mixture proportions and the underlying sampling probabilities
#' for each clone. Then a random sample of underlying SNV probabilities
#' is sampled from either a rbeta or rtexp distrubition with given parameters
#' and read counts are generated by bionimial conditonal on the underlying
#' SNV probability with the clone sampling probabilities.
#'
#' @param n.samples number of infected hosts in population
#' @param n.snps number of SNPs observed
#' @param moi number of clonal infections
#' @param mean_coverage average coverage of reads
#' @param error_coverage deviation in coverage of reads
#' @param pi.true optional matrix of true mixture proportions
#' @param mu.true optional matrix of true mixture components
#'
#' @return A list containing the following elements
#' pi.true an n.samples by moi matrix containing true clonal proportions
#' mu.true an n.samplps by moi matrix containing true genotype proportions
#' aaf an n.snps vector of under SNV probabilities
#' read.counts an n.samples by n.snps matrix containing read counts supporting each SNV
#' error.counts an n.samples by n.snps matrix containing number of error reads
#' @importFrom MCMCpack rdirichlet
#' @importFrom BiocParallel bplapply
#' @export
simulateMOI <- function(n.samples, n.snps, moi, mean_coverage, error_coverage,
pi.true = NULL, mu.true = NULL) {
# I/0 checking
if (moi < 2 || moi > 5) {
stop("Number of infections must be between 2 and 5, inclusive")
}
# check clonal mixture matrix
if(!is.null(pi.true)) {
if(!inherits(pi.true, "matrix")) {
stop("pi.true must be a matrix.")
}
if(!all(dim(pi.true) == c(moi, n.samples))) {
stop("pi.true must have dimensions moi by n.samples")
}
check.sums <- colSums(pi.true)
if(!all.equal(sum(check.sums), n.samples)) {
stop("Columns of pi.true must sum to 1.")
}
}
# check genotype proportion matrix
if(!is.null(mu.true)) {
if(!inherits(mu.true, "matrix")) {
stop("mu.true must be a matrix")
}
if(!all(dim(mu.true) == c(moi, n.samples))) {
stop("mu.true must have dimensions moi by n.samples")
}
if(any(mu.true < 0) || any(mu.true > 1)) {
stop("All entries of mu.true must be between 0 and 1.")
}
}
# step 1, generate underlying parameters from dirichlet prior
if (is.null(pi.true)) {
pi.true <- t(rdirichlet(n.samples,
alpha = 2^(1:moi)))
}
if (is.null(mu.true)) {
mu.true <- t(rdirichlet(n.samples,
alpha = 2^(1:moi)))
}
# generate mixture indexes for each SNV for each isolate
# produce an n.samples by n.snps matrix with assignments
# i.e. underlying haplotypes here for major clone
clusters <- bplapply(1:n.samples, function(i) {
rmultinom(n.snps, size = 1, prob = pi.true[,i])
})
# generate underlying SNV frequenciences
# use rtexp with fixed lambda to three (lots of rare variants)
aaf <- rtexp(n.snps, 3 , 1)
# generate observed coverage distribution from binomial
coverage <- rbinom(n.snps, mean_coverage, prob = 1 - error_coverage)
# conditional probabilities of observing each SNV given underlying
# clonal genotype
# generate matrix of assignments
sample.reads.alt <- bplapply(1:n.samples,
function(i) {
cl <- apply(clusters[[i]], 2, which.max)
mu <- mu.true[,i]
rbinom(n.snps,
size = coverage,
prob = mu[cl])
})
# generate whether SNV is observed within a clone using
# based on overall coverage and alternate allele frequency
snv.observed <- bplapply(1:n.samples, function(i) {
sapply(aaf, rbinom, n = moi, size = 1)
})
# generate observed read counts in support of each SNV for each clone
# this is generated by whether a clone carries the variant
# multiplied by the number of reads within the clone and multiplied
# by wheter the variant is observed
read.counts.by.clone <- bplapply(1:n.samples, function(i) {
snv.observed[[i]] * sample.reads.alt[[i]] }
)
read.counts.all <- bplapply(read.counts.by.clone, colSums)
return(list(pi.true = pi.true,
mu.true = mu.true,
coverage = coverage,
moi = clusters,
aaf = aaf,
read.counts.by.clone = read.counts.by.clone,
read.counts.all = read.counts.all,
snv.observed = snv.observed))
}
#' Generate iid binomial mixture random variables
#' for testing EM implementation
#'
#' @param n number of realisations
#' @param N number of trials
#' @param k number of components
#' @param mu true mixture components
#' @param pi true mixture weights
sampleMM <- function(n, N, k, mu, pi) {
# assert that pi and mu must have length k
stopifnot(length(pi) == k)
stopifnot(length(mu) == k)
# generate true hidden states
states <- sample.int(k, size = n, replace = TRUE, prob = pi)
# sample from binomial according to states
observations <- rbinom(n, size = N, prob = mu[states])
list(obs = observations, states = states)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.