#' Simulate data with gene-wise dispersion parameters
simulate_counts=function(K,B,n,g,
cls,SF,
beta,phi,LFCg_mat,
batch,batch_effects,sigma_b,sigma_g,DEb_ID,disp){ # batch of 0: no batch effect, batch of 1: yes effect
# center batch at 0: lower batch gets downregulated, higher batch gets upregulated
# if B=1, batch_eff = 0 with some noise by sigma_b
#batch_eff = (batch-(B-1)/2)*LFCb+rnorm(n,0,sigma_b) # previous parametrization: -0.5*LFCb or +0.5*LFCb for B=2, pred had +0*LFCb
#batch_effects = c(-1,1) ## FOR B=2. Need general way --> user input.
# batch_effects should be the same length as the number of batches, or length(unique(batch))
batch_eff = batch_effects[batch+1] # batch goes from 0, 1, ... for batch_pred, just 0 input for batch --> 1 batch.
cts <- matrix(rep(0,times=g*n),nrow=g)
b = beta + LFCg_mat # + noise_mat
if(disp=="gene"){
phi_mat = matrix(phi,nrow=g,ncol=K) # construct matrix with same phi values for all cls if g-disp
}
sigma_mat = matrix(rnorm(n*g,0,sigma_g),nrow=g,ncol=n)
mu_mat = matrix(0,nrow=g,ncol=n)
for(i in 1:n){
sigma_mat[,i] = rnorm(g,0,sigma_g)
mu_mat[,i] = SF[i]*2^(b[,cls[i]] + sigma_mat[,i] + DEb_ID*batch_eff[i])
cts[,i] = rnbinom( g, size = 1/phi_mat[,cls[i]], mu = mu_mat[,i] )
}
return(list(cts=cts,mu_mat=mu_mat,sigma_mat=sigma_mat))
}
#' Simulate RNA-seq bulk gene expression count data
#'
#' Simulate data based on input simulation parameters. Size factors are custom input
#' or simulated from N(1,0.25)
#'
#' @param K integer, number of clusters
#' @param B integer, number of batches
#' @param g integer, number of genes
#' @param n integer, number of samples
#' @param pK vector of length K (optional): proportion of samples in each cluster
#' @param pB vector of length B (optional): proportion of samples in each batch
#' @param LFCg numeric, LFC for cluster-discriminatory genes
#' @param pDEg numeric, proportion of genes that are cluster-discriminatory
#' @param sigma_g numeric, Gaussian noise added to each gene/sample N(0,sigma_g). Default is 0.1
#' @param LFCb numeric, LFC for genes that are differentially expressed across batch. Default is 1.
#' @param pDEb numeric, proportion of genes that are differentially expressed across batch. Default is 0.5.
#' @param sigma_b numeric, batch-specific Gaussian noise (default 0).
#' @param beta0 numeric, baseline log2 expression for each gene before LFC is applied
#' @param phi0 numeric, baseline overdispersion for each gene
#' @param SF vector of length n (optional), custom size factors from DESeq2. If NULL, simulated from N(1,0.25)
#' @param nsims integer, number of datasets to simulate given the input conditions. Default is 25.
#' @param disp string, either 'gene' or 'cluster' to simulate gene-level or cluster-level dispersions. Default is gene-level. Input phi must be g x K matrix if disp='cluster'
#' @param n_pred integer, number of samples in simulated prediction dataset. Default is 25
#' @param sim_batch_pred boolean: FALSE (no batch effect for prediction samples) or TRUE (batch effect)
#' @param LFCb_pred LFCb for batch-affected genes in prediction set. By default (NULL), = max(batch_effects) + LFCb/2: larger batch effect than training.
#' @param save_file boolean: TRUE (save each set of simulations)
#' @param save_dir string (optional): directory to save files. Default: 'Simulations/<sigma_g>_<sigma_b>/B<B>'
#' @param save_pref string (optional): prefix of file name to save simulated data to. Default: '<K>_<n>_<LFCg>_<pDEg>_<beta0>_<phi0>'
#'
#' @return if save_file=TRUE, then saved file in '<save_dir>/<save_pref>_sim<1:nsims>_data.RData'. Otherwise, list of length 'nsims', with a sim.dat list object for each simulation
#'
#' @author David K. Lim, \email{deelim@live.unc.edu}
#' @references \url{https://github.com/DavidKLim/FSCseq}
#'
#' @examples
#' sim.dat = FSCseq::simulateData(B=1, g=10000, K=2, n=50, LFCg=1, pDEg=0.05, beta0=12, phi0=0.35, nsims=1, save_file=F)[[1]]
#'
#' @export
simulateData<-function(K=2, B=1, g=10000, n=50, pK=NULL, pB=NULL,
LFCg=1, pDEg=0.05, sigma_g=0.1,
LFCb=1, pDEb=0.5, sigma_b=0,
beta0=12, phi0=0.35, SF=NULL,
nsims=25, disp="gene", n_pred=25, sim_batch_pred=FALSE, LFCb_pred=NULL,
save_file=TRUE, save_dir=NULL, save_pref=NULL){
if(B==1){LFCb=0}
if(save_file){
if(is.null(save_dir)){
dir_name1=sprintf("Simulations/%f_%f",sigma_g,sigma_b)
dir_name=sprintf("%s/B%d_LFCb%d",dir_name1,B,LFCb)
ifelse(!dir.exists(dir_name1),
dir.create(dir_name1),
FALSE)
}else{
dir_name=save_dir
}
ifelse(!dir.exists(dir_name),
dir.create(dir_name),
FALSE)
if(is.null(save_pref)){
file_name=sprintf("%d_%d_%f_%f_%f_%f",
K,n,LFCg,pDEg,beta0[1],phi0[1]) # just the first elt of beta0 and phi0
}else{file_name=save_pref}
}
match=match.call()
# add N(0,sigma) noise to each gene/cluster
# introduced sigma_g: draw LFC for each DE gene from N(LFCg,sigma_g). same w/ sigma_b for batch
# introduced sigma: small Gaussian noise N(0,sigma) introduced to beta for all gene/cluster combinations
#### this would make performance worse throughout
# SF: defaults to be simulated N(1,0.25), cutoff at bottom 0.25, cutoff at top by 2 (based on BRCA sample estimates)
# all the diff inputs of beta:
# 1) scalar --> change to matrix(beta,nrow=g,ncol=K)
# 2) vector of length g --> change to matrix(beta,nrow=g,ncol=K)
# 3) matrix of dim 1x1 --> change to matrix(beta,nrow=g,ncol=K)
# 4) matrix of dim gx1 or 1xg --> change to matrix(beta, nrow=g,ncol=K)
# 5) matrix of dim gxK --> matrix(beta, nrow=g,ncol=K)
if(is.null(pK)){ # custom pN (probability of being in each cl K) must be of length K, and add up to 1
pK=rep(1/K,K)
pB=rep(1/B,B)
}
if(disp=="gene"){
if(length(phi0)==1){
phi=rep(phi0,g)
} else if(length(phi) != g){
stop("phi must be length 1 or g")
} else{
phi=phi0 # specifying specific gene-level disp's for all genes
}
} else if(disp=="cluster"){
if(length(phi0)==K){
phi=matrix(phi0,nrow=g,ncol=K,byrow=TRUE)
} else if(length(phi0)!=(g*K)){
stop("phi must be gxK matrix")
} else{
phi=phi0 # specifying specific cluster-level disps for all genes/clusters
}
} else{
stop("specify 'gene' or 'cluster' for disp")
}
if(length(beta0) %in% c(1,g)){
beta=matrix(beta0,nrow=g,ncol=K) # if beta0 is length g, then will repeat for each cluster
} else{
stop("beta0 must be length 1 (common baseline) or g (gene-level baselines)")
}
all_sim.dats = list()
for(i in 1:nsims){
# order params in file name: K, n, LFCg, pDEg, beta, phi (use universally)
if(save_file){
# skip files that are already saved (already simulated)
file.name=sprintf("%s/%s_sim%d_data.RData",
dir_name,file_name,i)
if(file.exists(file.name)){warning("Data with same file name already simulated. Remove existing file to replace. Skipping simulation of existing file"); next}else{print(file.name)}
}
if(is.null(SF)){
# SF's are drawn from N(1,0.25), truncated at (0.25, 2.00)
SF=rnorm(n,1,0.25)
SF[SF<0.25]=0.25
SF[SF>2]=2
SF_pred=rnorm(n_pred,1,0.25) # n_pred < n
SF_pred[SF_pred<0.25]=0.25
SF_pred[SF_pred>2]=2
} else if(length(SF)!=n){
stop("Custom size factors must be of length n")
} else{
# if custom SF's input, then SF_pred are sampled from the custom SF's randomly
SF_pred=sample(SF,n_pred,replace=T)
}
cls=sample(1:K,n,replace=T,prob=pK)
batch=sample(1:B,n,replace=T,prob=pB)-1
DEg_ID=rep(F,g) # DEg across cls: first g*pDEg genes
DEg_ID[1:floor(pDEg*g)]=T
DEb_ID=rep(F,g) # select DEb across batch genes randomly
DEb_ID[sample(1:g,floor(pDEb*g),replace=F)]=T
LFCg_mat = matrix(0,nrow=g,ncol=K)
for(j in 1:floor(pDEg*g)){
signLFC=(j<floor(pDEg*g/2))*2-1 # +1 for j in 1:(DEgenes/2), -1 for rest
LFCg_mat[j,sample(1:K,1,replace=T)] = signLFC*LFCg
}
# make batch_effects be centered around LFCb, +- 0.5 between each consecutive batch
batch_effects = LFCb*( c(1:B)-mean(c(1:B)) )
fit<-simulate_counts(K=K,B=B,n=n,g=g,
cls=cls,SF=SF,
beta=beta,phi=phi,LFCg_mat=LFCg_mat,#noise_mat=noise_mat,
batch=batch,batch_effects=batch_effects,sigma_b=sigma_b,sigma_g=sigma_g,DEb_ID=DEb_ID,disp=disp) # gene-specific disp param
cts = fit$cts; mu_mat = fit$mu_mat; sigma_mat=fit$sigma_mat
cls_pred=sample(1:K,n_pred,replace=T,prob=pK)
batch_pred = rep(B, n_pred) # train batch: 0, 1, .. B-1. pred_batch: B. right now: ONLY ONE BATCH simulated for prediction set
batch_effects_pred =
if(sim_batch_pred){
if(is.null(LFCb_pred)){
max(batch_effects) + LFCb/2 # if batch effects were (-1,1), then this batch effect is 2: "slightly greater" effect. Is this even possible to correct for??
} else{LFCb_pred}
}else{ 0 }
fit_pred <- simulate_counts(K=K,B=1,n=n_pred,g=g,
cls=cls_pred,SF=SF_pred,
beta=beta,phi=phi,LFCg_mat=LFCg_mat,#noise_mat=noise_mat,
batch=rep(0,n_pred), batch_effects = batch_effects_pred,sigma_b=sigma_b,sigma_g=sigma_g,DEb_ID=DEb_ID,disp=disp) # gene-specific disp param
cts_pred = fit_pred$cts; mu_mat_pred = fit_pred$mu_mat; sigma_mat_pred = fit_pred$sigma_mat
sim_params=list(K=K,B=B,g=g,n=n,n_pred=n_pred,pK=pK,pB=pB,
LFCg=LFCg,pDEg=pDEg,sigma_g=sigma_g,
LFCb=LFCb,pDEb=pDEb,sigma_b=sigma_b,
batch_effects=batch_effects,batch_effects_pred=batch_effects_pred,
beta=beta,phi=phi,disp=disp,
LFCg_mat=LFCg_mat,
DEb_ID=DEb_ID, mu_mat=mu_mat, mu_mat_pred=mu_mat_pred, sigma_mat=sigma_mat, sigma_mat_pred=sigma_mat_pred)
sim.dat=list(cts=cts, cts_pred=cts_pred,
cls=cls, cls_pred=cls_pred, batch=batch, batch_pred=batch_pred,
SF=SF, SF_pred=SF_pred,
DEg_ID=DEg_ID, sim_params=sim_params)
if(save_file){
save(sim.dat,file=file.name)
}else{all_sim.dats[[i]]=sim.dat}
}
if(save_file){
file.name2=sprintf("%s/%s_dataParams.txt",dir_name,file_name)
sink(file=file.name2)
print(match)
cat(paste("\nB=",B," ,g=",g,", sigma_g=",sigma_g,", sigma_b=",sigma_b,"\n",
sep=""))
cat(paste("LFCb=",LFCb,", pDEb=",pDEb,", disp=",disp,", n_pred=",n_pred,"\n",
sep=""))
cat("\npK:\n")
write.table(pK,quote=F,col.names=F)
cat("\npB:\n")
write.table(pB,quote=F,col.names=F)
sink()
}else{
return(all_sim.dats)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.