powsimR: Power Simulations for RNA-sequencing

Documented in Setup

# Setup -----------------------------------------------------------------

#' @name Setup
#' @aliases Setup
#' @title Setup options for RNA-seq count simulations
#' @description This function generates the settings needed for \code{\link{simulateDE}}.
#' Firstly a set of differential expressed gene IDs with
#' associated fold changes for a given number of genes, simulations and
#' fraction of DE genes is generated. There are also a number of options relating
#'  to count simulations such as downsampling.
#'  Secondly, the estimated parameters for count simulations of genes (and spikes) are added.
#' @usage Setup(ngenes=NULL, nsims=25,
#' p.DE = 0.1, pLFC = 1, p.G = 1,
#' p.B=NULL, bLFC=NULL, bPattern="uncorrelated",
#' n1=c(20,50,100), n2=c(30,60,120),
#' Thinning = NULL, LibSize='equal',
#' estParamRes, estSpikeRes=NULL,
#' DropGenes=FALSE,
#' setup.seed, verbose = TRUE)
#' @param ngenes is a numeric vector specifying the number of genes to simulate.
#' Default is \code{NULL}, i.e. the number of genes that were deemed expressed after filtering.
#' See \code{\link{estimateParam}} and \code{\link{plotParam}} for more information about the number of genes that were used fo estimation and fitting.
#' @param nsims Number of simulations to run. Default is 25.
#' @param p.DE Numeric vector between 0 and 1 representing
#' the percentage of genes being differentially expressed due to phenotype,
#' i.e. biological signal. Default is \code{0.1}.
#' @param pLFC The log phenotypic fold change for DE genes. This can be:
#' (1) a constant, e.g. 2;
#' (2) a vector of values with length being number of DE genes.
#' If the input is a vector and the length is not the number of DE genes,
#' it will be sampled with replacement to generate log fold changes;
#' (3) a function that takes an integer n, and generates a vector of length n,
#' e.g. function(x) rnorm(x, mean=0, sd=1.5).
#' Default is \code{1}.
#' Please note that the fold change should be on \code{\link[base]{log2}} scale!
#' @param p.G Numeric vector indicating the proportion of replicates per group
#' that express the phenotypic fold change. Default is \code{1}, this means all show the expressiond difference.
#' For example, if \code{0.5} and \code{n1 = 10} and \code{n2 = 8},
#' then only 5 replicates in group 1 and 4 replicates in group 2 express the phenotypic fold change.
#' @param p.B Numeric vector between 0 and 1 representing the percentage of genes
#' being differentially expressed between batches.
#' Default is \code{NULL}, i.e. no batch effect.
#' @param bLFC The log batch fold change for all genes. This can be:
#' (1) a constant, e.g. 2;
#' (2) a vector of values with length being number of all genes.
#' If the input is a vector and the length is not the number of total genes,
#'  it will be sampled with replacement to generate log fold changes;
#' (3) a function that takes an integer n, and generates a vector of length n,
#'  e.g. function(x) rnorm(x, mean=0, sd=1.5).
#' Note that the simulations of only two batches is implemented.
#' Default is \code{NULL}, i.e. no batch effect.
#' Please note that the fold change should be on \code{\link[base]{log2}} scale!
#' @param bPattern Character vector for batch effect pattern if \code{p.B} is non-null.
#' Possible options include:
#' "uncorrelated", "orthogonal" and " correlated".
#' Default is \code{"uncorrelated"}.
#' @param n1,n2 Integer vectors specifying the number of biological replicates in each group.
#' Default values are n1=c(20,50,100) and n2=c(30,60,120).
#' The vectors need to have the same number of entries, i.e. length.
#' @param Thinning Numeric vector specifying the downsampling.
#' It has to be the same length and order as the vector \code{n1}.
#' This is an implementation of \code{\link[edgeR]{thinCounts}}.
#' The vector entries should be a proportion between 0 and 1,
#' e.g. \code{0.75} means that each sample in that group
#' will have on average 75\% sequencing depth compared
#' to the original sequencing depth as listed in \code{\link{estimateParam}}.
#' Note that no upsampling is possible, i.e. defining a proportion greater than 1.
#' The default is \code{NULL}, meaning no downsampling.
#' @param LibSize Size factors representing sample-specific differences/biases
#'  in expected mean values of counts:
#' \code{"equal"} or \code{"given"}. The default is \code{"equal"},
#' i.e. equal size factor of 1.
#' If the user defines it as \code{"given"}, the size factors are sampled from
#' the estimated size factors of \code{\link{estimateParam}}.
#' @param estParamRes The estimated simulation parameters for genes. This can be:
#' (1) The output of \code{\link{estimateParam}}.
#' (2) A string specifying the name of precalculated estimates, see details.
#' @param estSpikeRes The spike-in simulation parameters generated by \code{\link{estimateSpike}}.
#' These are needed for applying spike-in-dependent normalisation methods.
#' Default is \code{NULL}, i.e. no spike-in count simulations.
#' @param DropGenes By default, the estimated parameters and fit based on genes that
#'  were defined as expressed are used for simulations.
#' By setting this parameter to \code{TRUE}, a fraction of genes will be dropouts.
#' The dropout genes are defined in \code{\link{estimateParam}} using the \code{GeneFilter} option
#' and can be plotted with \code{\link{plotParam}}.
#' Default is \code{FALSE}, i.e. no gene expression dropouts.
#' @param setup.seed Setup seed.
#' @param verbose Logical vector to indicate whether to print function information.
#' Default is \code{TRUE}.
#'
#' @return A complex list with the following entries:
#' \item{DESetup}{A list of simulation options relating to the Differential Expression Setup:
#' number of genes simulated (ngenes); number of simulation iterations (nsims);
#' IDs of DE genes (DEid) which is a list (length=nsims) of vectors (length=ngenes*p.DE);
#' log fold change of genes (pLFC) which is a list (length=nsims) of vectors (length=ngenes);
#' similarly for batch effects (Bid and bLFC); list containing simulation seeds (sim.seed).}
#' \item{SimSetup}{A list of simulation options relating to the Simulation Setup:
#' the number of samples per group (n1 and n2); simulating spike-ins (spikeIns);
#' Thinning parameters (Thinning, thinSpike); Library Size Factors (LibSize);
#'  dropout genes (DropGenes and DropRate).}
#' \item{estParamRes}{A list object containing the gene simulation parameters provided.}
#' \item{estSpikeRes}{A list object containing the spike-in simulation parameters provided.}
#' @author Beate Vieth
#' @examples
#' \dontrun{
#' # estimate gene parameters
#' data("SmartSeq2_Gene_Read_Counts")
#' Batches = data.frame(Batch = sapply(strsplit(colnames(SmartSeq2_Gene_Read_Counts),
#'                                     "_"), "[[", 1),
#'                      stringsAsFactors = F,
#'                      row.names = colnames(SmartSeq2_Gene_Read_Counts))
#' data("GeneLengths_mm10")
#' estparam_gene <- estimateParam(countData = SmartSeq2_Gene_Read_Counts,
#'                                readData = NULL,
#'                                batchData = Batches,
#'                                spikeData = NULL, spikeInfo = NULL,
#'                                Lengths = GeneLengths_mm10, MeanFragLengths = NULL,
#'                                RNAseq = 'singlecell', Protocol = 'Read',
#'                                Distribution = 'ZINB', Normalisation = "scran",
#'                                GeneFilter = 0.1, SampleFilter = 3,
#'                               sigma = 1.96, NCores = NULL, verbose = TRUE)
#' # estimate spike parameters
#' data("SmartSeq2_SpikeIns_Read_Counts")
#' data("SmartSeq2_SpikeInfo")
#' Batches = data.frame(Batch = sapply(strsplit(colnames(SmartSeq2_SpikeIns_Read_Counts),
#'                                     "_"), "[[", 1),
#'                      stringsAsFactors = F,
#'                      row.names = colnames(SmartSeq2_SpikeIns_Read_Counts))
#' estparam_spike <- estimateSpike(spikeData = SmartSeq2_SpikeIns_Read_Counts,
#'                                 spikeInfo = SmartSeq2_SpikeInfo,
#'                                 MeanFragLength = NULL,
#'                                 batchData = Batches,
#'                                 Normalisation = 'depth')
#' # define log fold change
#' p.lfc <- function(x) sample(c(-1,1), size=x,replace=T)*rgamma(x, shape = 1, rate = 2)
#' # set up simulations
#' setupres <- Setup(ngenes = 10000, nsims = 25,
#'                   p.DE = 0.1, pLFC = p.lfc,
#'                   n1 = c(20,50,100), n2 = c(30,60,120),
#'                   Thinning = c(1,0.9,0.8), LibSize = 'given',
#'                   estParamRes = estparam_gene,
#'                   estSpikeRes = estparam_spike,
#'                   DropGenes = TRUE,
#'                   setup.seed = 52679, verbose = TRUE)
#' }
#' @rdname Setup
#' @export
  Setup <- function(ngenes = NULL, nsims = 25,
                    p.DE = 0.1, pLFC = 1, p.G = 1,
                    p.B = NULL, bLFC = NULL, bPattern = 'uncorrelated',
                    n1 = c(20,50,100), n2 = c(30,60,120),
                    Thinning = NULL, LibSize = 'equal',
                    estParamRes, estSpikeRes = NULL,
                    DropGenes = FALSE,
                    setup.seed, verbose = TRUE) {

  ## DE Setup:
  if(missing(setup.seed)){
    setup.seed = sample(1:1000000, size = 1)
  }
  set.seed(setup.seed)
  if(verbose) {message(paste0("Setup Seed: ", setup.seed))}

  nogenes <- ngenes
  detect.genes <- estParamRes$Parameters$Raw$ngenes
  est.genes <- estParamRes$Parameters$Filtered$ngenes

  if(isTRUE(DropGenes)){
    if(is.null(nogenes)){
      ngenes <- detect.genes
      message(paste0("You have not defined the number of genes to simulate. Therefore, given that the expression of ", detect.genes, " was detected, that number of genes will be simulated."))
      SwReplace = FALSE
    }
    if(is.numeric(nogenes)){
      ngenes <- nogenes
      if(ngenes <= est.genes){
        message(paste0("You have chosen to simulate the expression of ",
                       ngenes, " genes, which will be randomly drawn without replacement from the observed expression of ", est.genes, " genes."))
        SwReplace = FALSE
      }
      if(ngenes > est.genes){
        message(paste0("You have chosen to simulate the expression of ",
                       ngenes, " genes, which will be randomly drawn with replacement from the observed expression of ", est.genes, " genes."))
        SwReplace = TRUE
      }
    }

    if(all(is.na(estParamRes$Parameters$DropGene))){
      message(paste0("You want to include dropout genes, but the estParamRes object does not contain dropout genes.
                     Setting DropGenes to FALSE."))
      DropGenes = FALSE
    }
    if(all(!is.na(estParamRes$Parameters$DropGene))){
      DropRate = estParamRes$Parameters$DropGene$ngenes / detect.genes
      message(paste0("From the simulated ",
                     ngenes, " genes, ", round(DropRate*100), "% will be dropouts."))
    }

  }

  if(!isTRUE(DropGenes)){
    if(is.null(nogenes)){
      ngenes <- est.genes
      message(paste0("You have not defined the number of genes to simulate. Therefore, given that the expression of ", est.genes, " could be estimated, that number of genes will be simulated."))
      SwReplace = FALSE
    }
    if(is.numeric(nogenes)){
      ngenes <- nogenes
      if(ngenes <= est.genes){
        message(paste0("You have chosen to simulate the expression of ",
                       ngenes, " genes, which will be randomly drawn without replacement from the observed expression of ", detect.genes, " genes."))
        SwReplace = FALSE
      }
      if(ngenes > est.genes){
        message(paste0("You have chosen to simulate the expression of ",
                       ngenes, " genes, which will be randomly drawn with replacement from the observed expression of ", detect.genes, " genes."))
        SwReplace = TRUE
      }
    }

    DropRate <- NULL
  }


  if(any(c(p.G > 1, p.G <= 0.01))){
    stop(message(paste0("You wish to define a proportion of replicates per group only expresses the defined phenotypic fold change but the value is outside the allowed proportions [0.01,1]. Aborting.")))
  }

  nDE = round(ngenes*p.DE)
  if(!is.null(p.B)) { nB = round(ngenes*p.B) }
  DEids = Bids = plfcs = blfcs = NULL
  for (i in 1:nsims) {
    # generate a random id for DE genes
    DEid <- sample(ngenes, nDE, replace = FALSE)
    DEids[[i]] <- DEid
    ## generate lfc for all genes: 0 for nonDE and LFC for DE
    plfc = rep(0, ngenes)
    plfc[DEid] = .setFC(pLFC, nDE, k=2)
    plfcs[[i]] = plfc
    if(!is.null(bLFC)) {
      # generate a random id for batch affected genes
      Bid <- sample(ngenes, nB, replace = FALSE)
      Bids[[i]] <- Bid
      ## generate lfc for all genes: 0 for nonDE and LFC for DE
      blfc = rep(0, ngenes)
      blfc[Bid] = .setFC(bLFC, nB, k=2)
      blfcs[[i]] = blfc
    }
  }

  sim.seed = as.vector(sample(1:1000000, size = nsims, replace = F))

  ## Simulation Setup:
  if (!length(n1) == length(n2)) { stop("n1 and n2 must have the same length!") }

  if(is.null(estSpikeRes)){
    spikeIns = FALSE
    thinSpike = FALSE
  }
  if(!is.null(estSpikeRes)){
    spikeIns = TRUE
  }

  if(is.null(Thinning)){
    thinSpike = FALSE
  }
  if(!is.null(Thinning)){
    if(!is.null(Thinning) && !length(Thinning) == length(n1)) {
      stop(message(paste0("You wish to use binomial thinning but the Thinning vector is not the same length as the sample size vectors. Aborting.")))
    }
    if(!is.null(Thinning) && any(c(Thinning > 1, Thinning <= 0))) {
      stop(message(paste0("You wish to use binomial thinning but the Thinning vector contains values outside the allowed proportions [0,1]. Aborting.")))
    }
    if(!is.null(Thinning) && is.na(estParamRes$Fit$UmiRead) && attr(estParamRes, 'Protocol')=='UMI'){
      stop(message(paste0("You wish to use binomial thinning of UMI counts but there is no UMI-Read Fit in estParamRes object.
      In order to downsample UMI counts correctly, please provide readData in estimateParam(). Aborting.")))
    }
    if(!is.null(estSpikeRes)){
      thinSpike = TRUE
    }
  }

  # output objects
  DESetup <- c(list(DEid = DEids,
                    Bid = Bids,
                    pLFC = plfcs,
                    bLFC = blfcs,
                    ngenes = ngenes,
                    Draw = list(MoM='Filtered', Fit='Filtered'),
                    SwReplace = SwReplace,
                    nsims = nsims,
                    p.DE = p.DE,
                    p.B = p.B,
                    p.G = p.G,
                    bPattern = bPattern,
                    setup.seed = setup.seed),
               list(sim.seed = sim.seed),
               design = "2grp")

  SimSetup <- list(n1=n1,
                   n2=n2,
                   Thinning=Thinning,
                   spikeIns=spikeIns,
                   thinSpike=thinSpike,
                   LibSize=LibSize,
                   DropGenes=DropGenes,
                   DropRate=DropRate)

  # return object
  res <- c(DESetup=list(DESetup),
           SimSetup=list(SimSetup),
           estParamRes=list(estParamRes),
           estSpikeRes=list(estSpikeRes))

  attr(res, 'RNAseq') <- attr(estParamRes, 'RNAseq')
  attr(res, 'Distribution') <- attr(estParamRes, 'Distribution')
  attr(res, 'Protocol') <- attr(estParamRes, 'Protocol')

  return(res)
}