R/simulateMPRA.R
In MPRAnalyze: Statistical Analysis of MPRA data

Documented in simulateMPRA

#' Simulate an MPRA dataset
#' @param tr a vector of the true transcription rates. The length of the vector 
#' determines the number of enhancers included in the dataset Default is 100
#' enhancers of identical transcription rate of 2.
#' @param da a vector of the differential activity signal. Must be the same 
#' length as tr. if NULL (default) only one condition is simulated, with no 
#' differential activity. The values provided are used as the logFold Change
#' between the conditions, treating the tr vector as the reference condition.
#' For non-differentially active enhancers, this value should be 0.
#' @param dna.noise.sd level of noise to add to the DNA library
#' @param rna.noise.sd level of noise to add to the RNA library
#' @param dna.inter the baseline DNA levels (intercept term), controlling the 
#' true mean abundance of plasmids
#' @param dna.inter.sd the true variation of the plasmid levels
#' @param nbc number of unique barcode to include per enhancer
#' @param coef.bc.sd true variation between barcodes
#' @param nbatch number of batches to simulate
#' @param coef.batch.sd the level of true variation that distinguishes batches 
#' (the size of the batch effects)
#' 
#' @return a list: 
#' \itemize{
#'     \item true.dna The true dna abundances
#'     \item obs.dna the observed dna counts
#'     \item true.rna the true rna abundances
#'     \item obs.rna the observed rna counts
#'     \item annot the annotations data.frame for each sample
#' }
#' 
#' @details the data is generated by using the same nested-GLM construct that
#' MPRAnalyzes uses, with non-strandard log-normal noise models (whereas by 
#' default MPRAnalyze uses a Gamma-Poisson model).
#' The data generated can have multiple batches, and either 1 or 2 conditions,
#' and the simulated data is always paired (DNA and RNA extracted from the same
#' library).
#' User can control both true and observed variation levels (noise), the number
#' of expected plasmids per barcode, the true transcription ratio, the size
#' of the batch and barcode effects. 
#' 
#' @export
#' 
#' @examples
#' 
#' data <- simulateMPRA()
#' # single condition
#' data <- simulateMPRA(da=NULL)
#' # more observed noise
#' data <- simulateMPRA(dna.noise.sd = 0.75, rna.noise.sd = 0.75)
#' # gradually increasing dataset
#' data <- simulateMPRA(tr = seq(2,3,0.01), da=NULL)
simulateMPRA <- function(tr = rep(2, 100), 
                         da = c(rep(0, ceiling(length(tr) / 2)), 
                                rep(0.5, floor(length(tr) / 2))),
                         dna.noise.sd = 0.2, rna.noise.sd = 0.3,
                         dna.inter = 5, dna.inter.sd = 0.5,
                         nbc = 100, coef.bc.sd = 0.5,
                         nbatch = 3, coef.batch.sd = 0.5) {
    nenhancer = length(tr)
    stopifnot(is.null(da) | length(tr) == length(da))
    
    ## generate coefficient matrix for DNA
    coef.dna.bc <- matrix(rnorm(nenhancer * (nbc - 1), 
                                mean = 0, 
                                sd = coef.bc.sd), nrow = nenhancer)
    
    coef.dna.batch <- matrix(rnorm(nenhancer * (nbatch - 1), 
                                   mean = 0, 
                                   sd = coef.batch.sd), nrow = nenhancer)
    
    coef.dna.intercept <- matrix(rnorm(nenhancer, mean = dna.inter, 
                                       sd = dna.inter.sd), ncol = 1)
    
    coef.dna.mat <- cbind(coef.dna.intercept, coef.dna.batch, coef.dna.bc)
    
    ##  create corresponding annotation data.frame and design matrix
    annot <- expand.grid(barcode = factor(seq_len(nbc)),
                         batch = factor(seq_len(nbatch)))
        
    if(nbatch > 1 & nbc > 1) {
        desmat <- model.matrix(~ batch + barcode, annot)
    } else if (nbatch > 1) {
        desmat <- model.matrix(~ batch, annot)
    } else if (nbc > 1) {
        desmat <- model.matrix(~ barcode, annot)
    } else {
        stop("Specified design is too restricted")
    }
    
    ## get true counts by multiplying random coefficients with design matrix
    true.dna.log <-  coef.dna.mat %*% t(desmat)
    ## add noise to dna observations
    obs.dna.log <- matrix(rnorm(n = prod(dim(true.dna.log)), 
                                mean = true.dna.log, sd = dna.noise.sd), 
                          nrow = NROW(true.dna.log))
    ## if differential activity is needed, use same coefficients to generate
    ## another set of observations (different noise)
    if (!is.null(da)) {
        obs.dna.log.diff <- matrix(rnorm(n = prod(dim(true.dna.log)), 
                                        mean = true.dna.log, sd = dna.noise.sd),
                              nrow = NROW(true.dna.log))
        obs.dna.log <- cbind(obs.dna.log, obs.dna.log.diff)
        
        annot.diff <- annot
        annot$condition <- "reference"
        annot.diff$condition <- "contrast"
        annot <- rbind(annot, annot.diff)
        annot$condition <- factor(annot$condition, 
                                  levels=c("reference", "contrast"))
    }
    
    # get the true RNA by multiplying the true DNA by the transcription rate
    true.rna.log <- true.dna.log + 
        matrix(rep(log(tr), NCOL(true.dna.log)), ncol=NCOL(true.dna.log))
    
    ## if differential activity, add differential observations
    if (!is.null(da)) {
        true.rna.log.diff <- true.dna.log + 
            matrix(rep(log(tr) + da, 
                       NCOL(true.dna.log)), 
                   ncol=NCOL(true.dna.log))
        true.rna.log <- cbind(true.rna.log, true.rna.log.diff)
        true.dna.log <- cbind(true.dna.log, true.dna.log)
    }
    
    obs.rna.log <- matrix(rnorm(n = prod(dim(true.rna.log)),
                                mean = true.rna.log, sd = rna.noise.sd), 
                          nrow = NROW(true.rna.log))
    
    true.dna <- exp(true.dna.log)
    obs.dna <- exp(obs.dna.log)
    count.dna <- round(obs.dna)
    
    true.rna <- exp(true.rna.log)
    obs.rna <- exp(obs.rna.log)
    count.rna <- round(obs.rna)
    
    rownames(true.dna) <- rownames(count.dna) <- rownames(true.rna) <- 
        rownames(count.rna) <- paste0("enh_", seq_len(nenhancer))
    
    colnames(true.dna) <- colnames(count.dna) <- colnames(true.rna) <-
        colnames(count.rna) <- rownames(annot) <-
        paste0(annot$condition, "_b", annot$batch, "_bc", annot$barcode)
    
    return(list(true.dna = true.dna, obs.dna = count.dna, 
                true.rna = true.rna, obs.rna = count.rna, 
                annot = annot))
}