# Setup -----------------------------------------------------------------
#' @name Setup
#' @aliases Setup
#' @title Setup options for RNA-seq count simulations
#' @description This function generates the settings needed for \code{\link{simulateDE}}.
#' Firstly a set of differential expressed gene IDs with
#' associated fold changes for a given number of genes, simulations and
#' fraction of DE genes is generated. There are also a number of options relating
#' to count simulations such as downsampling.
#' Secondly, the estimated parameters for count simulations of genes (and spikes) are added.
#' @usage Setup(ngenes=NULL, nsims=25,
#' p.DE = 0.1, pLFC = 1, p.G = 1,
#' p.B=NULL, bLFC=NULL, bPattern="uncorrelated",
#' n1=c(20,50,100), n2=c(30,60,120),
#' Thinning = NULL, LibSize='equal',
#' estParamRes, estSpikeRes=NULL,
#' DropGenes=FALSE,
#' setup.seed, verbose = TRUE)
#' @param ngenes is a numeric vector specifying the number of genes to simulate.
#' Default is \code{NULL}, i.e. the number of genes that were deemed expressed after filtering.
#' See \code{\link{estimateParam}} and \code{\link{plotParam}} for more information about the number of genes that were used fo estimation and fitting.
#' @param nsims Number of simulations to run. Default is 25.
#' @param p.DE Numeric vector between 0 and 1 representing
#' the percentage of genes being differentially expressed due to phenotype,
#' i.e. biological signal. Default is \code{0.1}.
#' @param pLFC The log phenotypic fold change for DE genes. This can be:
#' (1) a constant, e.g. 2;
#' (2) a vector of values with length being number of DE genes.
#' If the input is a vector and the length is not the number of DE genes,
#' it will be sampled with replacement to generate log fold changes;
#' (3) a function that takes an integer n, and generates a vector of length n,
#' e.g. function(x) rnorm(x, mean=0, sd=1.5).
#' Default is \code{1}.
#' Please note that the fold change should be on \code{\link[base]{log2}} scale!
#' @param p.G Numeric vector indicating the proportion of replicates per group
#' that express the phenotypic fold change. Default is \code{1}, this means all show the expressiond difference.
#' For example, if \code{0.5} and \code{n1 = 10} and \code{n2 = 8},
#' then only 5 replicates in group 1 and 4 replicates in group 2 express the phenotypic fold change.
#' @param p.B Numeric vector between 0 and 1 representing the percentage of genes
#' being differentially expressed between batches.
#' Default is \code{NULL}, i.e. no batch effect.
#' @param bLFC The log batch fold change for all genes. This can be:
#' (1) a constant, e.g. 2;
#' (2) a vector of values with length being number of all genes.
#' If the input is a vector and the length is not the number of total genes,
#' it will be sampled with replacement to generate log fold changes;
#' (3) a function that takes an integer n, and generates a vector of length n,
#' e.g. function(x) rnorm(x, mean=0, sd=1.5).
#' Note that the simulations of only two batches is implemented.
#' Default is \code{NULL}, i.e. no batch effect.
#' Please note that the fold change should be on \code{\link[base]{log2}} scale!
#' @param bPattern Character vector for batch effect pattern if \code{p.B} is non-null.
#' Possible options include:
#' "uncorrelated", "orthogonal" and " correlated".
#' Default is \code{"uncorrelated"}.
#' @param n1,n2 Integer vectors specifying the number of biological replicates in each group.
#' Default values are n1=c(20,50,100) and n2=c(30,60,120).
#' The vectors need to have the same number of entries, i.e. length.
#' @param Thinning Numeric vector specifying the downsampling.
#' It has to be the same length and order as the vector \code{n1}.
#' This is an implementation of \code{\link[edgeR]{thinCounts}}.
#' The vector entries should be a proportion between 0 and 1,
#' e.g. \code{0.75} means that each sample in that group
#' will have on average 75\% sequencing depth compared
#' to the original sequencing depth as listed in \code{\link{estimateParam}}.
#' Note that no upsampling is possible, i.e. defining a proportion greater than 1.
#' The default is \code{NULL}, meaning no downsampling.
#' @param LibSize Size factors representing sample-specific differences/biases
#' in expected mean values of counts:
#' \code{"equal"} or \code{"given"}. The default is \code{"equal"},
#' i.e. equal size factor of 1.
#' If the user defines it as \code{"given"}, the size factors are sampled from
#' the estimated size factors of \code{\link{estimateParam}}.
#' @param estParamRes The estimated simulation parameters for genes. This can be:
#' (1) The output of \code{\link{estimateParam}}.
#' (2) A string specifying the name of precalculated estimates, see details.
#' @param estSpikeRes The spike-in simulation parameters generated by \code{\link{estimateSpike}}.
#' These are needed for applying spike-in-dependent normalisation methods.
#' Default is \code{NULL}, i.e. no spike-in count simulations.
#' @param DropGenes By default, the estimated parameters and fit based on genes that
#' were defined as expressed are used for simulations.
#' By setting this parameter to \code{TRUE}, a fraction of genes will be dropouts.
#' The dropout genes are defined in \code{\link{estimateParam}} using the \code{GeneFilter} option
#' and can be plotted with \code{\link{plotParam}}.
#' Default is \code{FALSE}, i.e. no gene expression dropouts.
#' @param setup.seed Setup seed.
#' @param verbose Logical vector to indicate whether to print function information.
#' Default is \code{TRUE}.
#'
#' @return A complex list with the following entries:
#' \item{DESetup}{A list of simulation options relating to the Differential Expression Setup:
#' number of genes simulated (ngenes); number of simulation iterations (nsims);
#' IDs of DE genes (DEid) which is a list (length=nsims) of vectors (length=ngenes*p.DE);
#' log fold change of genes (pLFC) which is a list (length=nsims) of vectors (length=ngenes);
#' similarly for batch effects (Bid and bLFC); list containing simulation seeds (sim.seed).}
#' \item{SimSetup}{A list of simulation options relating to the Simulation Setup:
#' the number of samples per group (n1 and n2); simulating spike-ins (spikeIns);
#' Thinning parameters (Thinning, thinSpike); Library Size Factors (LibSize);
#' dropout genes (DropGenes and DropRate).}
#' \item{estParamRes}{A list object containing the gene simulation parameters provided.}
#' \item{estSpikeRes}{A list object containing the spike-in simulation parameters provided.}
#' @author Beate Vieth
#' @examples
#' \dontrun{
#' # estimate gene parameters
#' data("SmartSeq2_Gene_Read_Counts")
#' Batches = data.frame(Batch = sapply(strsplit(colnames(SmartSeq2_Gene_Read_Counts),
#' "_"), "[[", 1),
#' stringsAsFactors = F,
#' row.names = colnames(SmartSeq2_Gene_Read_Counts))
#' data("GeneLengths_mm10")
#' estparam_gene <- estimateParam(countData = SmartSeq2_Gene_Read_Counts,
#' readData = NULL,
#' batchData = Batches,
#' spikeData = NULL, spikeInfo = NULL,
#' Lengths = GeneLengths_mm10, MeanFragLengths = NULL,
#' RNAseq = 'singlecell', Protocol = 'Read',
#' Distribution = 'ZINB', Normalisation = "scran",
#' GeneFilter = 0.1, SampleFilter = 3,
#' sigma = 1.96, NCores = NULL, verbose = TRUE)
#' # estimate spike parameters
#' data("SmartSeq2_SpikeIns_Read_Counts")
#' data("SmartSeq2_SpikeInfo")
#' Batches = data.frame(Batch = sapply(strsplit(colnames(SmartSeq2_SpikeIns_Read_Counts),
#' "_"), "[[", 1),
#' stringsAsFactors = F,
#' row.names = colnames(SmartSeq2_SpikeIns_Read_Counts))
#' estparam_spike <- estimateSpike(spikeData = SmartSeq2_SpikeIns_Read_Counts,
#' spikeInfo = SmartSeq2_SpikeInfo,
#' MeanFragLength = NULL,
#' batchData = Batches,
#' Normalisation = 'depth')
#' # define log fold change
#' p.lfc <- function(x) sample(c(-1,1), size=x,replace=T)*rgamma(x, shape = 1, rate = 2)
#' # set up simulations
#' setupres <- Setup(ngenes = 10000, nsims = 25,
#' p.DE = 0.1, pLFC = p.lfc,
#' n1 = c(20,50,100), n2 = c(30,60,120),
#' Thinning = c(1,0.9,0.8), LibSize = 'given',
#' estParamRes = estparam_gene,
#' estSpikeRes = estparam_spike,
#' DropGenes = TRUE,
#' setup.seed = 52679, verbose = TRUE)
#' }
#' @rdname Setup
#' @export
Setup <- function(ngenes = NULL, nsims = 25,
p.DE = 0.1, pLFC = 1, p.G = 1,
p.B = NULL, bLFC = NULL, bPattern = 'uncorrelated',
n1 = c(20,50,100), n2 = c(30,60,120),
Thinning = NULL, LibSize = 'equal',
estParamRes, estSpikeRes = NULL,
DropGenes = FALSE,
setup.seed, verbose = TRUE) {
## DE Setup:
if(missing(setup.seed)){
setup.seed = sample(1:1000000, size = 1)
}
set.seed(setup.seed)
if(verbose) {message(paste0("Setup Seed: ", setup.seed))}
nogenes <- ngenes
detect.genes <- estParamRes$Parameters$Raw$ngenes
est.genes <- estParamRes$Parameters$Filtered$ngenes
if(isTRUE(DropGenes)){
if(is.null(nogenes)){
ngenes <- detect.genes
message(paste0("You have not defined the number of genes to simulate. Therefore, given that the expression of ", detect.genes, " was detected, that number of genes will be simulated."))
SwReplace = FALSE
}
if(is.numeric(nogenes)){
ngenes <- nogenes
if(ngenes <= est.genes){
message(paste0("You have chosen to simulate the expression of ",
ngenes, " genes, which will be randomly drawn without replacement from the observed expression of ", est.genes, " genes."))
SwReplace = FALSE
}
if(ngenes > est.genes){
message(paste0("You have chosen to simulate the expression of ",
ngenes, " genes, which will be randomly drawn with replacement from the observed expression of ", est.genes, " genes."))
SwReplace = TRUE
}
}
if(all(is.na(estParamRes$Parameters$DropGene))){
message(paste0("You want to include dropout genes, but the estParamRes object does not contain dropout genes.
Setting DropGenes to FALSE."))
DropGenes = FALSE
}
if(all(!is.na(estParamRes$Parameters$DropGene))){
DropRate = estParamRes$Parameters$DropGene$ngenes / detect.genes
message(paste0("From the simulated ",
ngenes, " genes, ", round(DropRate*100), "% will be dropouts."))
}
}
if(!isTRUE(DropGenes)){
if(is.null(nogenes)){
ngenes <- est.genes
message(paste0("You have not defined the number of genes to simulate. Therefore, given that the expression of ", est.genes, " could be estimated, that number of genes will be simulated."))
SwReplace = FALSE
}
if(is.numeric(nogenes)){
ngenes <- nogenes
if(ngenes <= est.genes){
message(paste0("You have chosen to simulate the expression of ",
ngenes, " genes, which will be randomly drawn without replacement from the observed expression of ", detect.genes, " genes."))
SwReplace = FALSE
}
if(ngenes > est.genes){
message(paste0("You have chosen to simulate the expression of ",
ngenes, " genes, which will be randomly drawn with replacement from the observed expression of ", detect.genes, " genes."))
SwReplace = TRUE
}
}
DropRate <- NULL
}
if(any(c(p.G > 1, p.G <= 0.01))){
stop(message(paste0("You wish to define a proportion of replicates per group only expresses the defined phenotypic fold change but the value is outside the allowed proportions [0.01,1]. Aborting.")))
}
nDE = round(ngenes*p.DE)
if(!is.null(p.B)) { nB = round(ngenes*p.B) }
DEids = Bids = plfcs = blfcs = NULL
for (i in 1:nsims) {
# generate a random id for DE genes
DEid <- sample(ngenes, nDE, replace = FALSE)
DEids[[i]] <- DEid
## generate lfc for all genes: 0 for nonDE and LFC for DE
plfc = rep(0, ngenes)
plfc[DEid] = .setFC(pLFC, nDE, k=2)
plfcs[[i]] = plfc
if(!is.null(bLFC)) {
# generate a random id for batch affected genes
Bid <- sample(ngenes, nB, replace = FALSE)
Bids[[i]] <- Bid
## generate lfc for all genes: 0 for nonDE and LFC for DE
blfc = rep(0, ngenes)
blfc[Bid] = .setFC(bLFC, nB, k=2)
blfcs[[i]] = blfc
}
}
sim.seed = as.vector(sample(1:1000000, size = nsims, replace = F))
## Simulation Setup:
if (!length(n1) == length(n2)) { stop("n1 and n2 must have the same length!") }
if(is.null(estSpikeRes)){
spikeIns = FALSE
thinSpike = FALSE
}
if(!is.null(estSpikeRes)){
spikeIns = TRUE
}
if(is.null(Thinning)){
thinSpike = FALSE
}
if(!is.null(Thinning)){
if(!is.null(Thinning) && !length(Thinning) == length(n1)) {
stop(message(paste0("You wish to use binomial thinning but the Thinning vector is not the same length as the sample size vectors. Aborting.")))
}
if(!is.null(Thinning) && any(c(Thinning > 1, Thinning <= 0))) {
stop(message(paste0("You wish to use binomial thinning but the Thinning vector contains values outside the allowed proportions [0,1]. Aborting.")))
}
if(!is.null(Thinning) && is.na(estParamRes$Fit$UmiRead) && attr(estParamRes, 'Protocol')=='UMI'){
stop(message(paste0("You wish to use binomial thinning of UMI counts but there is no UMI-Read Fit in estParamRes object.
In order to downsample UMI counts correctly, please provide readData in estimateParam(). Aborting.")))
}
if(!is.null(estSpikeRes)){
thinSpike = TRUE
}
}
# output objects
DESetup <- c(list(DEid = DEids,
Bid = Bids,
pLFC = plfcs,
bLFC = blfcs,
ngenes = ngenes,
Draw = list(MoM='Filtered', Fit='Filtered'),
SwReplace = SwReplace,
nsims = nsims,
p.DE = p.DE,
p.B = p.B,
p.G = p.G,
bPattern = bPattern,
setup.seed = setup.seed),
list(sim.seed = sim.seed),
design = "2grp")
SimSetup <- list(n1=n1,
n2=n2,
Thinning=Thinning,
spikeIns=spikeIns,
thinSpike=thinSpike,
LibSize=LibSize,
DropGenes=DropGenes,
DropRate=DropRate)
# return object
res <- c(DESetup=list(DESetup),
SimSetup=list(SimSetup),
estParamRes=list(estParamRes),
estSpikeRes=list(estSpikeRes))
attr(res, 'RNAseq') <- attr(estParamRes, 'RNAseq')
attr(res, 'Distribution') <- attr(estParamRes, 'Distribution')
attr(res, 'Protocol') <- attr(estParamRes, 'Protocol')
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.