R/simulation.R

Defines functions Eval_sim_boxplot Twocelltype.Generator

Documented in Twocelltype.Generator

# Simulation functions
#
# Author: Xuran Wang
##################################################################################

#' Simulate Single cell read counts
#'
#' Simulate expected library sizes from a log-normal distribution
#'
#' @param N integer, number of subjects in total.
#' @param n.bulk integer, number of bulk subjects.
#' @param G integer, number of genes.
#' @param mu matrix, 2 by G, the mean expression of genes in each cell types.
#' @param sub.var matrix, 2 by G, scale of log-normal distribution for each subjects at genes. Part of them are zeros.
#' @param sort.bulk logical, default is FALSE. Sort the cell type proportion.
#'
#' @details
#' Generate simulation datasets. n.sc or n.bulk is zero, then we only generate
#' one dataset and cell type proportions.
#'
#' @return a list with elements
#'    * cell type proportions of generated bulk data
#'    * cell type proportions of generated single cell data
#'    * ExpressionSet of bulk data
#'    * ExpressionSet of single cell data
#'
#' @importFrom MCMCpack rdirichlet
#' @importFrom stats rlnorm
#'
#' @export
Twocelltype.Generator = function(n.sc = 10, n.bulk = 100, G = 100, mu, sub.var, sort.bulk = FALSE, sigma = NULL, ...){
  K = 2; #2 cell types
  N = n.sc + n.bulk

  #generate cell number and cell type proportion
  Ni = sample(700:2000, N, replace = T)
  p = rdirichlet(N, rep(4, K))
  Nk = cbind(round(p[,1]*Ni), round(p[,2]*Ni))
  p = Nk/Ni

  muA = mu[1, ]   #c(rep(0,10), runif(90, 20, 70))
  muB = mu[2, ]   #c(rep(50, 10), rep(0, 90)) + muA
  #sf = sample(1:10, 5); muA[sf] = 50; muB[sf] = 0;

  # Generate subject level variation
  sub.Fac = list(sapply(1:G, function(i){rlnorm(N, 0, sub.var[1, i])}),
                 sapply(1:G, function(i){rlnorm(N, 0, sub.var[2, i])}) )
  GN = NULL;
  for(g in 1:G){
    GN = c(GN, paste('gene', g, sep = ''))
  }
  if(G>9){
    GN[1:9] = c('gene01', 'gene02', 'gene03', 'gene04', 'gene05', 'gene06', 'gene07', 'gene08', 'gene09');
  }else{
    GN[1:G] = paste0('gene0', 1:G)
  }

  Bulk.null = FALSE; SC.null = FALSE;
  if(n.bulk == 0) Bulk.null = TRUE;
  if(n.sc == 0) SC.null = TRUE;

  if(is.null(sigma)){
    if(!Bulk.null){
      Xjg = NULL    # generate bulk tissue data
      for(i in 1:n.bulk){
        Xjg = cbind( Xjg, colSums( sapply(muA*sub.Fac[[1]][i, ], rpois, n = Nk[i,1]) )
                     + colSums( sapply(muB*sub.Fac[[2]][i, ], rpois, n = Nk[i,2]) ) )
      }
    }
    if(!SC.null){
      nk = NULL; #S = NULL; thetaA = NULL; thetaB = NULL;   # generate single cell statistics
      Xjgc = NULL; #Xsc.cov = NULL# generate single cell data
      for(i in (1+n.bulk):N){
        nk1 = max(rbinom(1, Nk[i, 1], 0.1), 2)
        Xjgc1 = t( sapply(muA*sub.Fac[[1]][i, ], rpois, n = nk1) )
        #thetaA = cbind(thetaA, rowSums(Xjgc1)/sum(Xjgc1))

        nk2 = max(rbinom(1, Nk[i, 2], 0.1), 2)
        Xjgc2 =t( sapply(muB*sub.Fac[[2]][i, ], rpois, n = nk2) )
        #thetaB = cbind(thetaB, rowSums(Xjgc2)/sum(Xjgc2))

        #S =rbind(S, c( mean( colSums(Xjgc1)), mean( colSums(Xjgc2))) )

        nk = rbind(nk, c(nk1, nk2));
        Xjgc = cbind(Xjgc, Xjgc1, Xjgc2);
        #Xsc.cov = cbind(Xsc.cov, rbind(rep(paste('indv', i-100, sep = '')), c(rep('A', nk1), rep('B', nk2) )) )
      }
    }
  }else{
    lambdaA = sapply(sigma/muA, rgamma, n = N, shape = sigma)
    lambdaB = sapply(sigma/muB, rgamma, n = N, shape = sigma)
    if(!Bulk.null){
      Xjg = NULL    # generate bulk tissue data
      for(i in 1:n.bulk){
        Xjg = cbind( Xjg, colSums( sapply(lambdaA[i, ] * sub.Fac[[1]][i, ], rpois, n = Nk[i,1]) )
                     + colSums( sapply(lambdaB[i, ] * sub.Fac[[2]][i, ], rpois, n = Nk[i,2]) ) )
      }
    }
    if(!SC.null){
      nk = NULL; #S = NULL; thetaA = NULL; thetaB = NULL;   # generate single cell statistics
      Xjgc = NULL; #Xsc.cov = NULL# generate single cell data
      for(i in (1+n.bulk):N){
        nk1 = max(rbinom(1, Nk[i, 1], 0.1), 2)
        Xjgc1 = t( sapply(lambdaA[i, ] * sub.Fac[[1]][i, ], rpois, n = nk1) )
        #thetaA = cbind(thetaA, rowSums(Xjgc1)/sum(Xjgc1))

        nk2 = max(rbinom(1, Nk[i, 2], 0.1), 2)
        Xjgc2 =t( sapply(lambdaB[i, ]*sub.Fac[[2]][i, ], rpois, n = nk2) )
        #thetaB = cbind(thetaB, rowSums(Xjgc2)/sum(Xjgc2))

        #S =rbind(S, c( mean( colSums(Xjgc1)), mean( colSums(Xjgc2))) )

        nk = rbind(nk, c(nk1, nk2));
        Xjgc = cbind(Xjgc, Xjgc1, Xjgc2);
        #Xsc.cov = cbind(Xsc.cov, rbind(rep(paste('indv', i-100, sep = '')), c(rep('A', nk1), rep('B', nk2) )) )
      }
    }
  }
  if(!SC.null){
    stop.idx = cumsum( rowSums(nk) )
    start.idx = c(1, stop.idx[-(N-n.bulk)]+1)
    idx.level = unlist(sapply(1:(N-n.bulk), function(i){rep(i, rowSums(nk)[i])}))
    ct.level = unlist(sapply(1:(N-n.bulk), function(i){c(rep(1, nk[i,1]), rep(2, nk[i, 2]))}))
  }
  if(!Bulk.null){
    colnames(p) = c('A', 'B');
    p.bulk = p[1:n.bulk, ];
    if(sort.bulk){
      op.bulk = order(p.bulk[,1])
      p.bulk = p.bulk[op.bulk, ]; rownames(p.bulk) = paste0('Bulk', 1:n.bulk)
      Xjg = Xjg[, op.bulk]; colnames(Xjg) <- paste0('Bulk', 1:n.bulk)
    }else{
      rownames(p.bulk) = paste0('Bulk', 1:n.bulk)
      colnames(Xjg) <- paste0('Bulk', 1:n.bulk)
    }
    rownames(Xjg) <- GN;

    Bulk.pData = data.frame(sampleID = 1:n.bulk, SubjectName = factor(colnames(Xjg), levels = colnames(Xjg)) )
    rownames(Bulk.pData) = colnames(Xjg);
    metadata <- data.frame(labelDescription = c("Sample ID", "Subject Name"), row.names = c("sampleID", "SubjectName"))
    Bulk.eset <- ExpressionSet(assayData = data.matrix(Xjg), phenoData = new("AnnotatedDataFrame", data = Bulk.pData, varMetadata = metadata))
  }

  if(!SC.null){
    p.sc = p[(n.bulk+1):N, ]; rownames(p.sc) = paste0("Sub", (n.bulk+1):N);
    rownames(Xjgc) <- GN
    colnames(Xjgc) <- paste0('cell', 1:ncol(Xjgc))
    SC.pData = data.frame(sampleID = ((n.bulk+1):N)[idx.level], SubjectName = factor(paste0("Sub", (n.bulk+1):N)[idx.level], levels = paste0("Sub", (n.bulk+1):N)),
                          cellTypeID = ct.level, cellType = c('A', 'B')[ct.level])

    rownames(SC.pData) = colnames(Xjgc)

    metadata <- data.frame(labelDescription= c("Sample ID", "Subject Name", "Cell Type ID", "Cell Type Name"),
                           row.names=c("sampleID", "SubjectName","cellTypeID", "cellType"))

    SC.eset = ExpressionSet(assayData = data.matrix(Xjgc), phenoData = new("AnnotatedDataFrame", data = SC.pData, varMetadata = metadata))
  }
  if(SC.null){
    p.sc = NULL; SC.eset = NULL;
  }
  if(Bulk.null){
    p.bulk = NULL; Bulk.eset = NULL;
  }
  out = list(p.bulk = p.bulk, p.sc = p.sc, Bulk.eset = Bulk.eset, SC.eset = SC.eset)
  return(out)
}


Eval_sim_boxplot = function(list, title.sup = ''){
  par(mfrow = c(3, 1))
  boxplot(t(list$R.sim), main = paste0('Boxplot of Pearson Correlation, ', title.sup))
  lines(c(-10, 100), rep(max(apply(list$R.sim, 1 , median)), 2), col = 2 )

  boxplot(t(list$RMSD.sim), main = paste0('Boxplot of RMSD, ', title.sup))
  lines(c(-10, 100), rep(min(apply(list$RMSD.sim, 1 , median)), 2), col = 2 )

  boxplot(t(list$mAD.sim), main = paste0('Boxplot of mAD, ', title.sup))
  lines(c(-10, 100), rep(min(apply(list$mAD.sim, 1 , median)), 2), col = 2 )
}
nkoneill/mMuSiC documentation built on Oct. 25, 2023, 2:39 a.m.