###############################################################
## run simulation and DE detection
###############################################################
#' @name simulateDE
#' @aliases simulateDE
#' @title Simulate Differential Expression
#' @description This function simulates RNA-seq count matrices considering differential expression specifications (number of samples per group, effect size, number of differential expressed genes etc.). The return object contains DE test results from all simulations as well as descriptive statistics.
#' @details simulateDE is the main function to simulate differential expression for RNA-seq experiments. The simulation parameters are specified with \code{\link{SimSetup}}. The user needs to specify the number of samples per group and the differential expression analysis method. \cr
#' It only stores and returns the DE test results (i.e. p-values). The error matrix calculations will be conducted with \code{\link{evaluateSim}}.\cr
#' @usage simulateDE(n1=c(20,50,100),
#' n2=c(30,60,120),
#' sim.settings,
#' ncores=NULL,
#' DEmethod,
#' verbose=TRUE)
#' @param n1,n2 Integer vectors specifying the number of biological replicates in each group. Default values are n1=c(20,50,100) and n2=c(30,60,120).
#' @param sim.settings This object specifies the simulation setup. This must be the return object from \code{\link{SimSetup}}.
#' @param ncores integer positive number of cores for parallel processing, default is NULL, ie 1 core.
#' @param DEmethod String to specify the DE detection method to be used. Available options are: limma, edgeR, DESeq2, ROTS, baySeq, NOISeq, EBSeq, DSS, MAST, scde, BPSC, scDD.
#' @param verbose Logical value to indicate whether to show progress report of simulations. Default is TRUE.
#' @return A list with the following fields. The dimensions for the 3D arrays are ngenes * N * nsims:
#' \item{pvalue, fdr}{3D array for p-values and FDR from each simulation. Note that FDR values will be empty and the calculation will be done by \code{\link{evaluateSim}} whenever applicable.}
#' \item{mu,disp,dropout}{3D array for mean, dispersion and dropout of library size factor normalized read counts}
#' \item{n1,n2}{The input number of biological replicates in each group. The vectors must have the same length.}
#' \item{time.taken}{The time taken for each simulation, given for read count simulation, DEA and moment estimation. }
#' @author Beate Vieth
#' @seealso \code{\link{estimateNBParam}}, \code{\link{insilicoNBParam}} for negative binomial parameter specifications;\cr
#' \code{\link{DESetup}}, \code{\link{SimSetup}} for simulation setup
#' @examples
#' \dontrun{
#' # download count table
#' githubURL <- "https://github.com/bvieth/powsimRData/raw/master/data-raw/kolodziejczk_cnts.rda"
#' download.file(url = githubURL, destfile= "kolodziejczk_cnts.rda", method = "wget")
#' load('kolodziejczk_cnts.rda')
#' kolodziejczk_cnts <- kolodziejczk_cnts[, grep('standard', colnames(kolodziejczk_cnts))]
#' ## estimate NB parameters:
#' TwoiLIF.params = estimateNBParam(countData = kolodziejczk_cnts,
#' cData = NULL, design = NULL,
#' RNAseq = 'singlecell', estFramework = 'MatchMoments',
#' sigma= 1.96)
#' ## define DE settings:
#' desettings <- DESetup(ngenes=10000,
#' nsims=25, p.DE=0.2,
#' LFC=function(x) sample(c(-1,1), size=x,replace=TRUE)*rgamma(x, 3, 3))
#' ## define simulation settings for Kolodziejczk:
#' simsettings <- SimSetup(desetup=desettings, params=TwoiLIF.params, size.factors='given')
#' ## run simulations:
#' simres <- simulateDE(n1=c(24,48,96,192,384,800),
#' n2=c(24,48,96,192,384,800),
#' sim.settings=simsettings,
#' ncores=10, DEmethod="MAST", verbose=TRUE)
#' ## if parallel computation unavailable, consider ROTS as DEmethod
#' }
#' @rdname simulateDE
#' @export
simulateDE <- function(n1=c(20,50,100), n2=c(30,60,120), sim.settings, ncores=NULL, DEmethod, verbose=TRUE) {
if (!length(n1) == length(n2))
stop("n1 and n2 must have the same length!")
if(!is.null(ncores) && DEmethod %in% c("edgeR", 'limma', "NOISeq", "DSS", "EBSeq", "ROTS")) {
message(paste0(DEmethod, " has no parallel computation option. Number of cores will be set to 1!"))
}
if(sim.settings$RNAseq == "singlecell" && DEmethod %in% c("edgeR", "DESeq2", "baySeq", "NOISeq", "DSS", "EBSeq")) {
message(paste0(DEmethod, " is developed for bulk RNA-seq experiments."))
}
if(sim.settings$RNAseq == "bulk" && DEmethod %in% c("MAST", 'BPSC')) {
message(paste0(DEmethod, " is developed for single cell RNA-seq experiments."))
}
if(sim.settings$RNAseq == "bulk" && DEmethod %in% c('scde', 'scDD')) {
stop(cat(DEmethod, " is only developed and implemented for single cell RNA-seq experiments."))
}
max.n = max(n1,n2)
nsims = sim.settings$nsims
sim.settings$ncores = ncores
#set up output arrays
pvalues = fdrs = mus = disps = dropouts = array(NA,dim=c(sim.settings$ngenes,length(n1), nsims))
time.taken = array(NA,dim = c(3,length(n1), nsims), dimnames = list(c("params", "DE", "NB"), NULL, NULL))
## start simulation
for (i in 1:nsims) {
if (verbose)
message(paste0("Simulation number ", i, "\n"))
## update the simulation options by extracting the ith set and change sim.seed
tmp.simOpts = sim.settings
tmp.simOpts$DEid = tmp.simOpts$DEid[[i]]
tmp.simOpts$lfc = tmp.simOpts$lfc[[i]]
tmp.simOpts$sim.seed = tmp.simOpts$sim.seed[[i]]
## generate read counts
dat.sim = .simRNAseq(simOptions = tmp.simOpts, n1 = max.n, n2 = max.n)
## for different sample sizes
for (j in seq(along=n1)) {
Nrep1 = n1[j]
Nrep2 = n2[j]
## take a subsample of the simulated counts
idx = c(1:Nrep1, max.n + (1:Nrep2))
tmp.design = dat.sim$designs[idx]
tmp.cnts = dat.sim$counts[,idx]
## filter out zero expression genes
ix.valid = rowSums(tmp.cnts) > 0
tmp.cnts.red = tmp.cnts[ix.valid,, drop = FALSE]
## create an object to pass into DE detection with mean, disp, dropout estimation
data0 = list(counts = tmp.cnts.red,
designs = tmp.design,
p.DEs = tmp.simOpts$p.DE,
RNAseq = tmp.simOpts$RNAseq,
ncores = tmp.simOpts$ncores)
# methods developed for bulk
if (DEmethod == "edgeR")
res = .run.edgeRglm(data0)
# if (DEmethod == "edgeRQL")
# res = .run.edgeRql(data0)
if (DEmethod == "DESeq2")
res = .run.DESeq2(data0)
if (DEmethod == "limma")
res = .run.limma(data0)
if (DEmethod == "ROTS")
res = .run.ROTS(data0)
if (DEmethod == "baySeq")
res = .run.baySeq(data0)
if (DEmethod == "NOISeq")
res = .run.NOISeq(data0)
if (DEmethod == "DSS")
res = .run.DSS(data0)
if (DEmethod=='EBSeq')
res = .run.EBSeq(data0)
# if (DEmethod == "TSPM")
# res = .run.TSPM(data0)
# methods developed for single cell
if (DEmethod == "MAST")
res = .run.MAST(data0)
if (DEmethod == "scde")
res = .run.scde(data0)
if (DEmethod == "BPSC")
res = .run.BPSC(data0)
# if (DEmethod == "monocle")
# res = .run.monocle(data0)
if (DEmethod == "scDD")
res = .run.scDD(data0)
## extract results of DE testing
res.tmp = res$result
# generate empty vectors
pval = fdr = rep(NA, nrow(tmp.cnts))
mu.tmp = rep(NA, nrow(tmp.cnts))
disp.tmp = rep(NA, nrow(tmp.cnts))
p0.tmp = rep(NA, nrow(tmp.cnts))
# fill in with results
pval[ix.valid] = res.tmp[, "pval"]
fdr[ix.valid] = res.tmp[, "fdr"]
mu.tmp[ix.valid] = res.tmp$means
disp.tmp[ix.valid] = res.tmp$dispersion
p0.tmp[ix.valid] = res.tmp$dropout
# copy it in 3D array of results
pvalues[,j,i] = pval
fdrs[,j,i] = fdr
mus[,j,i] = mu.tmp
disps[,j,i] = disp.tmp
dropouts[,j,i] = p0.tmp
# copy time taken in 2D array of time taken
time.taken[,j,i] = res$timing
}
}
## return
list(pvalue = pvalues,
fdr = fdrs,
mu = mus,
disp = disps,
dropout = dropouts,
n1 = n1,
n2 = n2,
time.taken = time.taken,
DEmethod = DEmethod,
sim.settings = sim.settings)
}
#####################################################
## create simulated scRNAseq data for 2-grp comparison
#####################################################
#' @importFrom stats model.matrix
.simRNAseq <- function(simOptions, n1, n2) {
## make group labels
design <- c(rep(-1, n1), rep(1, n2))
## make model matrix
mod = stats::model.matrix(~-1 + design)
# generate read counts:
if (simOptions$RNAseq == "singlecell") {
simcounts = .scRNAseq_counts(sim.options = simOptions, modelmatrix = mod)
}
if (simOptions$RNAseq == "bulk") {
simcounts = .bulkRNAseq_counts(sim.options = simOptions, modelmatrix = mod)
}
simcounts <- apply(simcounts,2,function(x) {storage.mode(x) <- 'integer'; x})
colnames(simcounts) <- paste0("S", 1:ncol(simcounts))
rownames(simcounts) <- paste0("G", 1:nrow(simcounts))
## return
list(counts = simcounts, designs = design, simOptions = simOptions)
}
#####################################################
## draw read counts
#####################################################
#' @importFrom stats rnorm rnbinom
.scRNAseq_counts <- function(sim.options, modelmatrix) {
set.seed(sim.options$sim.seed)
print(sim.options$sim.seed)
nsamples = nrow(modelmatrix)
ngenes = sim.options$ngenes
mod = modelmatrix
beta = cbind(sim.options$lfc)
if (attr(sim.options, 'param.type') == 'estimated') {
# define NB params
mu = sim.options$means
meansizefit = sim.options$meansizefit
# sample from the mean parameter observed
index = sample(1:length(mu), size = ngenes, replace = T)
true.means = mu[index]
# estimate size parameter associated with mean values
lmu = log2(true.means + 1)
predsize.mean = approx(meansizefit$x, meansizefit$y, xout = lmu)$y
predsize.sd = approx(meansizefit$x, meansizefit$sd, xout = lmu)$y
sizevec = rnorm(n = length(lmu), mean = predsize.mean, sd = predsize.sd)
# capture efficiency (size factor)
if (sim.options$size.factors == "equal") {
all.facs <- rep(1, nsamples)
}
if (sim.options$size.factors == "given") {
if (is.function(sim.options$sf)) {
all.facs <- sim.options$sf(nsamples)
}
if (is.vector(sim.options$sf)) {
all.facs <- sample(sim.options$sf, nsamples, replace = TRUE)
}
if (is.null(sim.options$sf)) {
all.facs <- rep(1, nsamples)
}
}
# effective means
effective.means <- outer(true.means, all.facs, "*")
# make mean expression with beta coefficients added as defined by model matrix
ind = !apply(mod, 2, function(x) { all(x == 1) })
mod = cbind(mod[, ind])
beta = cbind(beta[, ind])
mumat = log2(effective.means + 1) + beta %*% t(mod)
mumat[mumat < 0] = 0
# result count matrix
counts = matrix(stats::rnbinom(nsamples * ngenes, mu = 2 ^ mumat - 1, size = 2 ^ sizevec), ncol = nsamples, nrow = ngenes)
}
if (attr(sim.options, 'param.type') == 'insilico') {
# sample from the mean function provided
true.means = sim.options$means(ngenes)
# associate with dispersion parameter
if (length(sim.options$dispersion) == 1) {
disp = sim.options$dispersion
}
if (is.function(sim.options$dispersion)) {
disp = sim.options$dispersion(true.means)
}
# capture efficiency (size factor)
if (sim.options$size.factors == "equal") {
all.facs <- rep(1, nsamples)
}
if (sim.options$size.factors == "given") {
if (is.function(sim.options$sf)) {
all.facs <- sim.options$sf(nsamples)
}
if (is.vector(sim.options$sf)) {
all.facs <- sample(sim.options$sf, nsamples, replace = TRUE)
}
if (is.null(sim.options$sf)) {
all.facs <- rep(1, nsamples)
}
}
# effective means
effective.means <- outer(true.means, all.facs, "*")
# make mean expression with beta coefficients added as defined by model matrix
ind = !apply(mod, 2, function(x) { all(x == 1) })
mod = cbind(mod[, ind])
beta = cbind(beta[, ind])
mumat = log2(effective.means + 1) + beta %*% t(mod)
mumat[mumat < 0] = 0
# result count matrix
counts = matrix(stats::rnbinom(nsamples * ngenes, mu = 2 ^ mumat - 1, size = 1/disp), ncol = nsamples, nrow = ngenes)
}
return(counts)
}
#####################################################
## draw read counts
#####################################################
#' @importFrom stats rnorm rbinom rnbinom approx
.bulkRNAseq_counts <- function(sim.options, modelmatrix) {
set.seed(sim.options$sim.seed)
print(sim.options$sim.seed)
nsamples = nrow(modelmatrix)
ngenes = sim.options$ngenes
mod = modelmatrix
beta = cbind(sim.options$lfc)
if (attr(sim.options, 'param.type') == 'estimated') {
# define NB params
mu = sim.options$means
meansizefit = sim.options$meansizefit
# sample from the mean parameter observed
index = sample(1:length(mu), size = ngenes, replace = T)
true.means = mu[index]
# estimate size parameter associated with mean values
lmu = log2(true.means + 1)
predsize.mean = stats::approx(meansizefit$x, meansizefit$y, xout = lmu)$y
predsize.sd = stats::approx(meansizefit$x, meansizefit$sd, xout = lmu)$y
sizevec = stats::rnorm(n = length(lmu), mean = predsize.mean, sd = predsize.sd)
# associate dropout probability with mean parameter
p0vec = rep(0, ngenes)
p0s = ifelse(lmu < sim.options$p0.cut, sample(x = sim.options$p0,size = 1), p0vec)
p0mat = t(sapply(p0s, function(x) {
stats::rbinom(nsamples, prob = 1 - x, size = 1)
}, simplify = T))
# capture efficiency (size factor)
if (sim.options$size.factors == "equal") {
all.facs <- rep(1, nsamples)
}
if (sim.options$size.factors == "given") {
if (is.function(sim.options$sf)) {
all.facs <- sim.options$sf(nsamples)
}
if (is.vector(sim.options$sf)) {
all.facs <- sample(sim.options$sf, nsamples, replace = TRUE)
}
if (is.null(sim.options$sf)) {
all.facs <- rep(1, nsamples)
}
}
# effective means
effective.means <- outer(true.means, all.facs, "*")
# make mean expression with beta coefficients added as defined by model matrix
ind = !apply(mod, 2, function(x) { all(x == 1) })
mod = cbind(mod[, ind])
beta = cbind(beta[, ind])
mumat = log2(effective.means + 1) + beta %*% t(mod)
mumat[mumat < 0] = 0
# result count matrix
cnts = matrix(stats::rnbinom(nsamples * ngenes, mu = 2 ^ mumat - 1, size = 2 ^ sizevec), ncol = nsamples, nrow = ngenes)
counts = p0mat * cnts
}
if (attr(sim.options, 'param.type') == 'insilico') {
# sample from the mean function provided
true.means = sim.options$means(ngenes)
# sample dropout parameter
if (is.null(sim.options$p0)) {
p0s <- rep(0, ngenes)
}
if (is.function(sim.options$p0)) {
p0s <- sim.options$p0(ngenes)
}
p0mat = t(sapply(p0s, function(x) {
stats::rbinom(nsamples, prob = 1 - x, size = 1)
}, simplify = T))
# associate with dispersion parameter
if (length(sim.options$dispersion) == 1) {
disp = sim.options$dispersion
}
if (is.function(sim.options$dispersion)) {
disp = sim.options$dispersion(true.means)
}
# capture efficiency (size factor)
if (sim.options$size.factors == "equal") {
all.facs <- rep(1, nsamples)
}
if (sim.options$size.factors == "given") {
if (is.function(sim.options$sf)) {
all.facs <- sim.options$sf(nsamples)
}
if (is.vector(sim.options$sf)) {
all.facs <- sample(sim.options$sf, nsamples, replace = TRUE)
}
if (is.null(sim.options$sf)) {
all.facs <- rep(1, nsamples)
}
}
# effective means
effective.means <- outer(true.means, all.facs, "*")
# make mean expression with beta coefficients added as defined by model matrix
ind = !apply(mod, 2, function(x) { all(x == 1) })
mod = cbind(mod[, ind])
beta = cbind(beta[, ind])
mumat = log2(effective.means + 1) + beta %*% t(mod)
mumat[mumat < 0] = 0
# result count matrix
cnts = matrix(stats::rnbinom(nsamples * ngenes, mu = 2 ^ mumat - 1, size = 1/disp), ncol = nsamples, nrow = ngenes)
counts = p0mat * cnts
}
return(counts)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.