MPRAnalyze: Statistical Analysis of MPRA data

Documented in simulateMPRA

#' Simulate an MPRA dataset
#' @param tr a vector of the true transcription rates, in log scale. The length 
#' of the vector determines the number of enhancers included in the dataset. 
#' Default is 100 enhancers of identical transcription rate of 2.
#' @param da a vector determinig differential activity. Values are assumed to be
#' in log scale, and will be used in the model as log Fold-Change values. If 
#' NULL (default) a single condition is simulated.
#' @param dna.noise.sd level of noise to add to the DNA library
#' @param rna.noise.sd level of noise to add to the RNA library
#' @param dna.inter the baseline DNA levels (intercept term), controlling the 
#' true mean abundance of plasmids
#' @param dna.inter.sd the true variation of the plasmid levels
#' @param nbc number of unique barcode to include per enhancer
#' @param coef.bc.sd true variation between barcodes
#' @param nbatch number of batches to simulate
#' @param coef.batch.sd the level of true variation that distinguishes batches 
#' (the size of the batch effects)
#' 
#' @return a list: 
#' \itemize{
#'     \item true.dna The true dna abundances
#'     \item obs.dna the observed dna counts
#'     \item true.rna the true rna abundances
#'     \item obs.rna the observed rna counts
#'     \item annot the annotations data.frame for each sample
#' }
#' 
#' @details the data is generated by using the same nested-GLM construct that
#' MPRAnalyzes uses, with non-strandard log-normal noise models (whereas by 
#' default MPRAnalyze uses a Gamma-Poisson model).
#' The data generated can have multiple batches, and either 1 or 2 conditions,
#' and the simulated data is always paired (DNA and RNA extracted from the same
#' library).
#' User can control both true and observed variation levels (noise), the number
#' of expected plasmids per barcode, the true transcription ratio, the size
#' of the batch and barcode effects. 
#' 
#' @export
#' 
#' @examples
#' # single condition
#' data <- simulateMPRA()
#' # two conditions
#' data <- simulateMPRA(da=c(rep(-0.5, 50), rep(0.5, 50)))
#' # more observed noise
#' data <- simulateMPRA(dna.noise.sd = 0.75, rna.noise.sd = 0.75)
#' # gradually increasing dataset
#' data <- simulateMPRA(tr = seq(2,3,0.01), da=NULL)
simulateMPRA <- function(tr = rep(2, 100), 
                         da = NULL,
                         dna.noise.sd = 0.2, rna.noise.sd = 0.3,
                         dna.inter = 5, dna.inter.sd = 0.5,
                         nbc = 100, coef.bc.sd = 0.5,
                         nbatch = 3, coef.batch.sd = 0.5) {
    nenhancer = length(tr)
    stopifnot(is.null(da) | length(tr) == length(da))
    stopifnot(is.null(da) | nbc > 1 | nbatch > 1)
    
    ## generate coefficient matrix for DNA
    coef.dna.bc <- matrix(rnorm(nenhancer * (nbc - 1), 
                                mean = 0, 
                                sd = coef.bc.sd), nrow = nenhancer)
    
    coef.dna.batch <- matrix(rnorm(nenhancer * (nbatch - 1), 
                                   mean = 0, 
                                   sd = coef.batch.sd), nrow = nenhancer)
    
    coef.dna.intercept <- matrix(rnorm(nenhancer, mean = dna.inter, 
                                       sd = dna.inter.sd), ncol = 1)
    
    coef.dna.mat <- cbind(coef.dna.intercept, coef.dna.batch, coef.dna.bc)
    
    if (is.null(da)) {
        cond.fact <- factor("reference")
    } else {
        cond.fact <- factor(c("reference", "contrast"), 
                            levels = c("reference", "contrast"))
    }
    ##  create corresponding annotation data.frame 
    annot <- expand.grid(barcode = factor(seq_len(nbc)),
                         batch = factor(seq_len(nbatch)),
                         condition = factor(cond.fact))
    
    # create the design matrix for the DNA model
    form <- "~ 1"
    if (nbatch > 1) {form <- c(form, "batch")}
    if (nbc > 1) {form <- c(form, "barcode")}
    
    desmat.dna <- model.matrix(as.formula(paste0(form, collapse = "+")), annot)

    ## get true counts by multiplying random coefficients with design matrix
    true.dna.log <-  coef.dna.mat %*% t(desmat.dna)
    ## add noise to dna observations
    obs.dna.log <- matrix(rnorm(n = prod(dim(true.dna.log)), 
                                mean = true.dna.log, sd = dna.noise.sd), 
                          nrow = NROW(true.dna.log))
   
    ## create RNA model components
    coef.rna.mat <- cbind(tr, da)
    
    form <- if (is.null(da)) ~ 1 else ~ condition
    desmat.rna <- model.matrix(form, annot)
    ## get true counts by adding the log-alpha to the dna counts
    true.rna.log <- true.dna.log + (coef.rna.mat %*% t(desmat.rna))
    
    ## add noise to rna observations
    obs.rna.log <- matrix(rnorm(n = prod(dim(true.rna.log)),
                                mean = true.rna.log, sd = rna.noise.sd), 
                          nrow = NROW(true.rna.log))
    
    true.dna <- exp(true.dna.log)
    obs.dna <- exp(obs.dna.log)
    count.dna <- round(obs.dna)
    
    true.rna <- exp(true.rna.log)
    obs.rna <- exp(obs.rna.log)
    count.rna <- round(obs.rna)
    
    rownames(true.dna) <- rownames(count.dna) <- rownames(true.rna) <- 
        rownames(count.rna) <- paste0("enh_", seq_len(nenhancer))
    
    colnames(true.dna) <- colnames(count.dna) <- colnames(true.rna) <-
        colnames(count.rna) <- rownames(annot) <-
        paste0(annot$condition, "_b", annot$batch, "_bc", annot$barcode)
    
    return(list(true.dna = true.dna, obs.dna = count.dna, 
                true.rna = true.rna, obs.rna = count.rna, 
                annot = annot))
}