R/simple_moi_simulation.R

Defines functions sampleMM simulateMOI

Documented in sampleMM simulateMOI

#' Simple simulation of MOI SNV data
#' 
#' @description Generate read count data in support of SNVs partioned by 
#' mixtures of clones within a host. This is done by setting true
#' clonal mixture proportions and the underlying sampling probabilities
#' for each clone. Then a random sample of underlying SNV probabilities
#' is sampled from either a rbeta or rtexp distrubition with given parameters
#' and read counts are generated by bionimial conditonal on the underlying
#' SNV probability with the clone sampling probabilities. 
#' 
#' @param n.samples number of infected hosts in population
#' @param n.snps number of SNPs observed
#' @param moi number of clonal infections
#' @param mean_coverage average coverage of reads
#' @param error_coverage deviation in coverage of reads
#' @param pi.true optional matrix of true mixture proportions
#' @param mu.true optional matrix of true mixture components
#' 
#' @return A list containing the following elements
#'  pi.true an n.samples by moi matrix containing true clonal proportions
#'  mu.true an n.samplps by moi matrix containing true genotype proportions
#'  aaf an n.snps vector of under SNV probabilities
#'  read.counts an n.samples by n.snps matrix containing read counts supporting each SNV
#'  error.counts an n.samples by n.snps matrix containing number of error reads 
#' @importFrom MCMCpack rdirichlet
#' @importFrom BiocParallel bplapply
#' @export
simulateMOI <- function(n.samples, n.snps, moi, mean_coverage, error_coverage,
                         pi.true = NULL, mu.true = NULL) {
    # I/0 checking
    if (moi < 2 || moi > 5) {
        stop("Number of infections must be between 2 and 5, inclusive")
    }
    
    
    
    # check clonal mixture matrix
    if(!is.null(pi.true)) {
        if(!inherits(pi.true, "matrix")) {
            stop("pi.true must be a matrix.")
        }
        
        if(!all(dim(pi.true) == c(moi, n.samples))) {
            stop("pi.true must have dimensions moi by n.samples")
        }
        
        check.sums <- colSums(pi.true)
        if(!all.equal(sum(check.sums), n.samples)) {
            stop("Columns of pi.true must sum to 1.")
        }
        
    }
    
    # check genotype proportion matrix
    if(!is.null(mu.true)) {
        if(!inherits(mu.true, "matrix")) {
            stop("mu.true must be a matrix")
        }
        
        if(!all(dim(mu.true) == c(moi, n.samples))) {
            stop("mu.true must have dimensions moi by n.samples")
        }
        
        if(any(mu.true < 0) || any(mu.true > 1)) {
            stop("All entries of mu.true must be between 0 and 1.")
        }
    }
    
    
    # step 1, generate underlying parameters from dirichlet prior
    if (is.null(pi.true)) {
        pi.true <- t(rdirichlet(n.samples, 
                                          alpha = 2^(1:moi)))
        
    }
    
    if (is.null(mu.true)) {
        mu.true <- t(rdirichlet(n.samples, 
                                          alpha = 2^(1:moi)))
    }
    # generate mixture indexes for each SNV for each isolate
    # produce an n.samples by n.snps matrix with assignments
    # i.e. underlying haplotypes here for major clone
    clusters <- bplapply(1:n.samples, function(i) {
        rmultinom(n.snps, size = 1, prob = pi.true[,i])

    })
    # generate underlying SNV frequenciences
    # use rtexp with fixed lambda to three (lots of rare variants)
    aaf <- rtexp(n.snps, 3 , 1)
    # generate observed coverage distribution from binomial
    coverage <- rbinom(n.snps, mean_coverage, prob = 1 - error_coverage)
    
    # conditional probabilities of observing each SNV given underlying
    # clonal genotype
    # generate matrix of assignments
    sample.reads.alt <- bplapply(1:n.samples,
                             function(i) {
                                 cl <- apply(clusters[[i]], 2, which.max)
                                 mu <- mu.true[,i]
                                 rbinom(n.snps, 
                                        size = coverage, 
                                        prob = mu[cl])
                             })
    
    # generate whether SNV is observed within a clone using
    # based on overall coverage and alternate allele frequency
    snv.observed <- bplapply(1:n.samples, function(i) {
        sapply(aaf, rbinom, n = moi, size = 1)
    })
    
    # generate observed read counts in support of each SNV for each clone
    # this is generated by whether a clone carries the variant 
    # multiplied by the number of reads within the clone and multiplied
    # by wheter the variant is observed
    read.counts.by.clone <- bplapply(1:n.samples, function(i) {
      snv.observed[[i]] * sample.reads.alt[[i]] }
    )
        
    read.counts.all <- bplapply(read.counts.by.clone, colSums)
    
    
    
    return(list(pi.true = pi.true, 
                mu.true = mu.true,
                coverage = coverage,
                moi = clusters,
                aaf = aaf,
                read.counts.by.clone = read.counts.by.clone,
                read.counts.all = read.counts.all,
                snv.observed = snv.observed))
    
}

#' Generate iid  binomial mixture random variables 
#' for testing EM implementation
#' 
#' @param n number of realisations
#' @param N number of trials
#' @param k number of components
#' @param mu true mixture components
#' @param pi true mixture weights
sampleMM <- function(n, N, k, mu, pi) {
    # assert that pi and mu must have length k
    stopifnot(length(pi) == k)
    stopifnot(length(mu) == k)
    # generate true hidden states
    states <- sample.int(k, size = n, replace = TRUE, prob = pi)
    # sample from binomial according to states
    observations <- rbinom(n, size = N, prob = mu[states])
    list(obs = observations, states = states)
}
bahlolab/moimix documentation built on April 16, 2020, 8:40 a.m.