R/Single_Cell_Simu.R

Defines functions singleCellSimu calcMV calcRP

Documented in calcMV calcRP singleCellSimu

#' calcRP
#'
#'  Calculate parameter R and P in NB distribution
#'
#' @param Emean Empirical mean
#'
#' @param Evar Empirical variance
#' 
#' @references Korthauer KD, Chu LF, Newton MA, Li Y, Thomson J, Stewart R, 
#' Kendziorski C. A statistical approach for identifying differential 
#' distributions
#' in single-cell RNA-seq experiments. Genome Biology. 2016 Oct 25;17(1):222. 
#' \url{https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-
#' 1077-y}
#' 
#' @return RP Vector of two elements, first contains method of moments 
#' estimator for r and second contains 
#'  method of moments estimator for p (parameters of NB distribution)
#'   

calcRP <- function(Emean, Evar){
  RP <- rep(NA,2)
  RP[1]=max(0.01,(Emean)^2/(Evar-Emean));
  RP[2]=max(0.01,1-Emean/Evar);
  if((RP[1]==0.01)&(RP[2]==0.01))
  {
    RP[1]=1
    RP[2]=0.5
  }
  RP[2]=min(RP[2],0.99999999)
  return(RP)
}


#' calcMV
#'
#'  Calculate empirical means and variances of selected genes in a given 
#'  dataset.
#'  
#' @details Calculate empirical means and variances of selected genes in a 
#' given dataset. Optionally, multiply
#'  the means and standard deviations by a fold change value, which can also 
#'  vary by mean value.  If the mean is
#'  below some specified threshold \code{threshold}, use one fold change value 
#'  \code{FC}.  If above the threshold, use the alternate 
#'  fold change value \code{FC.thresh}.  Estimates of mean and variance are 
#'  robust to outliers.
#'
#' @param a Numeric vector of values to calculate empirical mean and variance.
#'
#' @param FC Fold change for the mean and standard deviation.  Default value 
#' is 1.
#' 
#' @param FC.thresh Alternate fold change for the mean and standard deviation 
#' when the (log nonzero) mean is above the value of \code{threshold}.  
#'  Default value is \code{FC}.
#' 
#' @param threshold Mean threshold value which dictates which fold change value
#'  to use for multiplying mean and standard deviation.
#'  Default value is Inf (so \code{FC} is always used).
#'  
#' @param include.zeroes Logical value indicating whether the zero values 
#' should be included in the calculations of the empirical means 
#'    and variances.
#' 
#' @return MV Vector of two elements, first contains the empirical mean 
#' estimate, second contains the empirical variance estimate (optionally 
#'   multiplied by a fold change).
#'   

calcMV <- function(a, FC=1, FC.thresh=NA, threshold=Inf, include.zeroes=FALSE){
  MV <- rep(NA, 2)
  if(outliers::grubbs.test(log(a+1))$p.value<0.01){
    a <- a[-which.max(a)]
  }
  
  if(!include.zeroes){
    a <- a[a!=0]
  }
  
  if (mean(log(a[a!=0])) < threshold){
    MV[1] <- mean(a)*FC
    MV[2] <- var(a)*FC^2
  }else{
    if (is.na(FC.thresh)){stop("Please specify valid value for alternate fold 
                               change FC.thresh!")}
    MV[1] <- mean(a)*FC.thresh
    MV[2] <- var(a)*FC.thresh^2
  }
  return(MV)
}

#' singleCellSimu
#'
#' Called by \code{\link{simulateSet}} to simulate a specified number of genes 
#' from 
#'   one DD category at a time.
#' 
#'
#'
#' @param Dataset1 Numeric matrix of expression values with genes in rows and 
#' samples in columns.
#'
#' @param Method Type of simulation should choose from "DE" "DP" "DM" "DB"
#'
#' @param index Reasonable set of genes for simulation
#'
#' @param FC Fold Change values for DE Simulation
#'
#' @param DP Differetial Proportion vector
#' 
#' @param modeFC Vector of values to use for fold changes between modes for DP,
#'  DM, and DB.
#'
#' @param Validation Show Validation plots or not
#' 
#' @param numGenes numeric value for the number of genes to simulate
#'
#' @param numDE numeric value for the number of genes that will differ between
#'  two conditions
#'  
#' @param generateZero Specification of how to generate the zero values.  
#' If "\code{empirical}" (default), the
#'  observed proportion of zeroes in each gene is used for the simuated data, 
#'  and the nonzeroes are simulated from a 
#'  truncated negative binomial distribution.  If "\code{simulated}", all 
#'  values are simulated out of
#'  a negative binomial distribution, includling the zeroes.  
#'  If "\code{constant}", then each gene has a fixed 
#'  proportion of zeroes equal to \code{constantZero}.
#'  
#' @param constantZero Numeric value between 0 and 1 that indicates the fixed 
#' proportion of zeroes for every gene.
#'  Ignored if \code{generateZero} method is not equal to "\code{constant}". 
#' 
#' @param numSamples numeric value for the number of samples in each condition 
#' to simulate
#' 
#' @param varInflation Optional numeric vector with one element for each 
#' condition that corresponds to the multiplicative 
#'  variance inflation factor to use when simulating data.  Useful for 
#'  sensitivity studies to assess the impact of 
#'  confounding effects on differential variance across conditions. Currently 
#'  assumes all samples within a 
#'  condition are subject to the same variance inflation factor.
#' 
#' @importFrom outliers grubbs.test
#'
#' @return Simulated_Data A list object where the first element contains a 
#' matrix of 
#' the simulated dataset, the second element contains the DEIndex, and the 
#' third 
#' element contains the fold change (between two conditions for DE, between 
#' two modes 
#' for DP, DM, and DB).

singleCellSimu <- function(Dataset1, Method, index, FC, modeFC, DP, 
                          Validation=FALSE, 
                          numGenes=1000, numDE=100, numSamples=100, 
                          generateZero=c("empirical", "simulated", "constant"),
                          constantZero=NULL, varInflation=NULL){
if(!is.null(varInflation)){
  if(!length(varInflation)==2){
    stop("Error: varInflation vector needs to contain one factor for each 
         sample or one factor for each condition")
  }
}
    
if(length(generateZero)>1){
  generateZero <- generateZero[1]
}
### Select numGenes genes from index
samplename <- sort(sample(index,numGenes,replace=TRUE))

FC2 <- rep(1,nrow=numGenes)

f <- sample(c(rep(modeFC, ceiling(numGenes/length(modeFC)))), numGenes, 
            replace=FALSE)
for(i in 1:numGenes){
  a <- as.numeric(Dataset1[samplename[i],])
  FC2[i] <- exp(f[i]*sqrt(log(1+var(a[a!=0])/mean(a[a!=0])^2)))
}

### Calculate means and variances for these genes
Zeropercent_Base <- as.matrix(apply(Dataset1[samplename,,drop=FALSE], 1, 
                                    function(a) length(which(a == 0)) / 
                                      length(a)))           
MV <- matrix(data = 0,nrow = numGenes,ncol = 4)
if(Method %in%  c("DP", "DM")){
  MV[,1:2] <- t(sapply(1:length(samplename), function(x) 
    calcMV(Dataset1[samplename[x],], FC=1, FC.thresh=FC2[x]^(-1/2), 
           threshold = 3, include.zeroes=FALSE)))
  MV[,3:4] <- t(sapply(1:length(samplename), function(x) 
    calcMV(Dataset1[samplename[x],], FC=1, FC.thresh=FC2[x]^(-1/2), 
           threshold = 3, include.zeroes=TRUE)))
}else{
  MV[,1:2] <- t(sapply(1:length(samplename), function(x) 
    calcMV(Dataset1[samplename[x],], FC=1, include.zeroes=FALSE)))
  MV[,3:4] <- t(sapply(1:length(samplename), function(x) 
    calcMV(Dataset1[samplename[x],], FC=1, include.zeroes=TRUE)))
}

### Calculate parameter R and P in NB distribution
if(generateZero %in% c("empirical", "constant")){
  RP <- t(apply(MV[,1:2,drop=FALSE], 1, function(x) calcRP(x[1], x[2])))
}else if (generateZero == "simulated"){
  RP <- t(apply(MV[,3:4,drop=FALSE], 1, function(x) calcRP(x[1], x[2])))
}else{
  stop("Error: Please specify a valid generateZero method!")
}

# if inflating the variance factor with varInflation, apply condition 1's 
#factor to this MV matrix
# and calculate the corresponding RP values
if(!is.null(varInflation)){
  MV.Infl1 <- MV
  MV.Infl1[,c(2,4)] <- MV[,c(2,4)]*varInflation[1] 
  
  MV.Infl2 <- MV
  MV.Infl2[,c(2,4)] <- MV[,c(2,4)]*varInflation[2] 
  
  if(generateZero %in% c("empirical", "constant")){
    RP <- cbind(RP, t(apply(MV.Infl1[,1:2,drop=FALSE], 1, 
                            function(x) calcRP(x[1], x[2]))))
    RP <- cbind(RP, t(apply(MV.Infl2[,1:2,drop=FALSE], 1, 
                            function(x) calcRP(x[1], x[2]))))
  }else if (generateZero == "simulated"){
    RP <- cbind(RP, t(apply(MV.Infl1[,3:4,drop=FALSE], 1, 
                            function(x) calcRP(x[1], x[2]))))
    RP <- cbind(RP, t(apply(MV.Infl2[,3:4,drop=FALSE], 1, 
                            function(x) calcRP(x[1], x[2]))))
  }
}


### Indentify the relationship between mean and variance
if(generateZero %in% c("empirical", "constant")){
  fit <- lm(log(MV[,2])~log(MV[,1]))
} else{
  fit <- lm(log(MV[,4])~log(MV[,3]))
}
coeff <- fit$coefficients


### Simulate data by the chosen method
Simulated_Data <- matrix(data=NA,nrow=numGenes,ncol=2*numSamples)
x <- 1:numGenes
DEIndex <- sample(x,numDE,replace=FALSE)
EEIndex <- x[!(x%in%DEIndex)]


if(Method=="DE")
{
	desim <- simuDE(Dataset1, Simulated_Data, DEIndex, samplename, 
	                Zeropercent_Base, f, FC, coeff, RP, modeFC,
	                generateZero, constantZero, varInflation)
	Simulated_Data <- desim[[1]]
  DE_FC <- desim[[2]]
} else if(Method=="DP"){
  dpsim <- simuDP(Dataset1, Simulated_Data, DEIndex, samplename, 
                  Zeropercent_Base, f, FC2, coeff, RP, modeFC, DP,
                  generateZero, constantZero, varInflation)
  Simulated_Data <- dpsim[[1]]
  DE_FC <- dpsim[[2]]
}else if(Method=="DM"){
  
  dmsim <- simuDM(Dataset1, Simulated_Data, DEIndex, samplename, 
                  Zeropercent_Base, f, FC2, coeff, RP, modeFC,
                  generateZero, constantZero, varInflation)
  Simulated_Data <- dmsim[[1]]
  DE_FC <- dmsim[[2]]
}else if(Method=="DB"){
  
  DBsim <- simuDB(Dataset1, Simulated_Data, DEIndex, samplename, 
                  Zeropercent_Base, f, FC2, coeff, RP, modeFC, DP,
                  generateZero, constantZero, varInflation)
  Simulated_Data <- DBsim[[1]]
  DE_FC <- DBsim[[2]]
}

if(Validation==TRUE){
  validation(MV, DEIndex, Zeropercent_Base, Simulated_Data, numGenes)
}
return(list(Simulated_Data, DEIndex, DE_FC))
}

Try the scDD package in your browser

Any scripts or data that you put into this service are public.

scDD documentation built on Nov. 8, 2020, 7:10 p.m.