# R/Single_Cell_Simu.R In scDD: Mixture modeling of single-cell RNA-seq data to identify genes with differential distributions

#### Documented in calcMVcalcRPsingleCellSimu

```#' calcRP
#'
#'  Calculate parameter R and P in NB distribution
#'
#' @param Emean Empirical mean
#'
#' @param Evar Empirical variance
#'
#' @references Korthauer KD, Chu LF, Newton MA, Li Y, Thomson J, Stewart R,
#' Kendziorski C. A statistical approach for identifying differential
#' distributions
#' in single-cell RNA-seq experiments. Genome Biology. 2016 Oct 25;17(1):222.
#' \url{https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-
#' 1077-y}
#'
#' @return RP Vector of two elements, first contains method of moments
#' estimator for r and second contains
#'  method of moments estimator for p (parameters of NB distribution)
#'

calcRP <- function(Emean, Evar){
RP <- rep(NA,2)
RP=max(0.01,(Emean)^2/(Evar-Emean));
RP=max(0.01,1-Emean/Evar);
if((RP==0.01)&(RP==0.01))
{
RP=1
RP=0.5
}
RP=min(RP,0.99999999)
return(RP)
}

#' calcMV
#'
#'  Calculate empirical means and variances of selected genes in a given
#'  dataset.
#'
#' @details Calculate empirical means and variances of selected genes in a
#' given dataset. Optionally, multiply
#'  the means and standard deviations by a fold change value, which can also
#'  vary by mean value.  If the mean is
#'  below some specified threshold \code{threshold}, use one fold change value
#'  \code{FC}.  If above the threshold, use the alternate
#'  fold change value \code{FC.thresh}.  Estimates of mean and variance are
#'  robust to outliers.
#'
#' @param a Numeric vector of values to calculate empirical mean and variance.
#'
#' @param FC Fold change for the mean and standard deviation.  Default value
#' is 1.
#'
#' @param FC.thresh Alternate fold change for the mean and standard deviation
#' when the (log nonzero) mean is above the value of \code{threshold}.
#'  Default value is \code{FC}.
#'
#' @param threshold Mean threshold value which dictates which fold change value
#'  to use for multiplying mean and standard deviation.
#'  Default value is Inf (so \code{FC} is always used).
#'
#' @param include.zeroes Logical value indicating whether the zero values
#' should be included in the calculations of the empirical means
#'    and variances.
#'
#' @return MV Vector of two elements, first contains the empirical mean
#' estimate, second contains the empirical variance estimate (optionally
#'   multiplied by a fold change).
#'

calcMV <- function(a, FC=1, FC.thresh=NA, threshold=Inf, include.zeroes=FALSE){
MV <- rep(NA, 2)
if(outliers::grubbs.test(log(a+1))\$p.value<0.01){
a <- a[-which.max(a)]
}

if(!include.zeroes){
a <- a[a!=0]
}

if (mean(log(a[a!=0])) < threshold){
MV <- mean(a)*FC
MV <- var(a)*FC^2
}else{
if (is.na(FC.thresh)){stop("Please specify valid value for alternate fold
change FC.thresh!")}
MV <- mean(a)*FC.thresh
MV <- var(a)*FC.thresh^2
}
return(MV)
}

#' singleCellSimu
#'
#' Called by \code{\link{simulateSet}} to simulate a specified number of genes
#' from
#'   one DD category at a time.
#'
#'
#'
#' @param Dataset1 Numeric matrix of expression values with genes in rows and
#' samples in columns.
#'
#' @param Method Type of simulation should choose from "DE" "DP" "DM" "DB"
#'
#' @param index Reasonable set of genes for simulation
#'
#' @param FC Fold Change values for DE Simulation
#'
#' @param DP Differetial Proportion vector
#'
#' @param modeFC Vector of values to use for fold changes between modes for DP,
#'  DM, and DB.
#'
#' @param Validation Show Validation plots or not
#'
#' @param numGenes numeric value for the number of genes to simulate
#'
#' @param numDE numeric value for the number of genes that will differ between
#'  two conditions
#'
#' @param generateZero Specification of how to generate the zero values.
#' If "\code{empirical}" (default), the
#'  observed proportion of zeroes in each gene is used for the simuated data,
#'  and the nonzeroes are simulated from a
#'  truncated negative binomial distribution.  If "\code{simulated}", all
#'  values are simulated out of
#'  a negative binomial distribution, includling the zeroes.
#'  If "\code{constant}", then each gene has a fixed
#'  proportion of zeroes equal to \code{constantZero}.
#'
#' @param constantZero Numeric value between 0 and 1 that indicates the fixed
#' proportion of zeroes for every gene.
#'  Ignored if \code{generateZero} method is not equal to "\code{constant}".
#'
#' @param numSamples numeric value for the number of samples in each condition
#' to simulate
#'
#' @param varInflation Optional numeric vector with one element for each
#' condition that corresponds to the multiplicative
#'  variance inflation factor to use when simulating data.  Useful for
#'  sensitivity studies to assess the impact of
#'  confounding effects on differential variance across conditions. Currently
#'  assumes all samples within a
#'  condition are subject to the same variance inflation factor.
#'
#' @importFrom outliers grubbs.test
#'
#' @return Simulated_Data A list object where the first element contains a
#' matrix of
#' the simulated dataset, the second element contains the DEIndex, and the
#' third
#' element contains the fold change (between two conditions for DE, between
#' two modes
#' for DP, DM, and DB).

singleCellSimu <- function(Dataset1, Method, index, FC, modeFC, DP,
Validation=FALSE,
numGenes=1000, numDE=100, numSamples=100,
generateZero=c("empirical", "simulated", "constant"),
constantZero=NULL, varInflation=NULL){
if(!is.null(varInflation)){
if(!length(varInflation)==2){
stop("Error: varInflation vector needs to contain one factor for each
sample or one factor for each condition")
}
}

if(length(generateZero)>1){
generateZero <- generateZero
}
### Select numGenes genes from index
samplename <- sort(sample(index,numGenes,replace=TRUE))

FC2 <- rep(1,nrow=numGenes)

f <- sample(c(rep(modeFC, ceiling(numGenes/length(modeFC)))), numGenes,
replace=FALSE)
for(i in 1:numGenes){
a <- as.numeric(Dataset1[samplename[i],])
FC2[i] <- exp(f[i]*sqrt(log(1+var(a[a!=0])/mean(a[a!=0])^2)))
}

### Calculate means and variances for these genes
Zeropercent_Base <- as.matrix(apply(Dataset1[samplename,,drop=FALSE], 1,
function(a) length(which(a == 0)) /
length(a)))
MV <- matrix(data = 0,nrow = numGenes,ncol = 4)
if(Method %in%  c("DP", "DM")){
MV[,1:2] <- t(sapply(1:length(samplename), function(x)
calcMV(Dataset1[samplename[x],], FC=1, FC.thresh=FC2[x]^(-1/2),
threshold = 3, include.zeroes=FALSE)))
MV[,3:4] <- t(sapply(1:length(samplename), function(x)
calcMV(Dataset1[samplename[x],], FC=1, FC.thresh=FC2[x]^(-1/2),
threshold = 3, include.zeroes=TRUE)))
}else{
MV[,1:2] <- t(sapply(1:length(samplename), function(x)
calcMV(Dataset1[samplename[x],], FC=1, include.zeroes=FALSE)))
MV[,3:4] <- t(sapply(1:length(samplename), function(x)
calcMV(Dataset1[samplename[x],], FC=1, include.zeroes=TRUE)))
}

### Calculate parameter R and P in NB distribution
if(generateZero %in% c("empirical", "constant")){
RP <- t(apply(MV[,1:2,drop=FALSE], 1, function(x) calcRP(x, x)))
}else if (generateZero == "simulated"){
RP <- t(apply(MV[,3:4,drop=FALSE], 1, function(x) calcRP(x, x)))
}else{
stop("Error: Please specify a valid generateZero method!")
}

# if inflating the variance factor with varInflation, apply condition 1's
#factor to this MV matrix
# and calculate the corresponding RP values
if(!is.null(varInflation)){
MV.Infl1 <- MV
MV.Infl1[,c(2,4)] <- MV[,c(2,4)]*varInflation

MV.Infl2 <- MV
MV.Infl2[,c(2,4)] <- MV[,c(2,4)]*varInflation

if(generateZero %in% c("empirical", "constant")){
RP <- cbind(RP, t(apply(MV.Infl1[,1:2,drop=FALSE], 1,
function(x) calcRP(x, x))))
RP <- cbind(RP, t(apply(MV.Infl2[,1:2,drop=FALSE], 1,
function(x) calcRP(x, x))))
}else if (generateZero == "simulated"){
RP <- cbind(RP, t(apply(MV.Infl1[,3:4,drop=FALSE], 1,
function(x) calcRP(x, x))))
RP <- cbind(RP, t(apply(MV.Infl2[,3:4,drop=FALSE], 1,
function(x) calcRP(x, x))))
}
}

### Indentify the relationship between mean and variance
if(generateZero %in% c("empirical", "constant")){
fit <- lm(log(MV[,2])~log(MV[,1]))
} else{
fit <- lm(log(MV[,4])~log(MV[,3]))
}
coeff <- fit\$coefficients

### Simulate data by the chosen method
Simulated_Data <- matrix(data=NA,nrow=numGenes,ncol=2*numSamples)
x <- 1:numGenes
DEIndex <- sample(x,numDE,replace=FALSE)
EEIndex <- x[!(x%in%DEIndex)]

if(Method=="DE")
{
desim <- simuDE(Dataset1, Simulated_Data, DEIndex, samplename,
Zeropercent_Base, f, FC, coeff, RP, modeFC,
generateZero, constantZero, varInflation)
Simulated_Data <- desim[]
DE_FC <- desim[]
} else if(Method=="DP"){
dpsim <- simuDP(Dataset1, Simulated_Data, DEIndex, samplename,
Zeropercent_Base, f, FC2, coeff, RP, modeFC, DP,
generateZero, constantZero, varInflation)
Simulated_Data <- dpsim[]
DE_FC <- dpsim[]
}else if(Method=="DM"){

dmsim <- simuDM(Dataset1, Simulated_Data, DEIndex, samplename,
Zeropercent_Base, f, FC2, coeff, RP, modeFC,
generateZero, constantZero, varInflation)
Simulated_Data <- dmsim[]
DE_FC <- dmsim[]
}else if(Method=="DB"){

DBsim <- simuDB(Dataset1, Simulated_Data, DEIndex, samplename,
Zeropercent_Base, f, FC2, coeff, RP, modeFC, DP,
generateZero, constantZero, varInflation)
Simulated_Data <- DBsim[]
DE_FC <- DBsim[]
}

if(Validation==TRUE){
validation(MV, DEIndex, Zeropercent_Base, Simulated_Data, numGenes)
}
return(list(Simulated_Data, DEIndex, DE_FC))
}
```

## Try the scDD package in your browser

Any scripts or data that you put into this service are public.

scDD documentation built on Nov. 8, 2020, 7:10 p.m.