R/simulation.R

Defines functions SimulateData SimulateBarcode

Documented in SimulateBarcode SimulateData

#' Simulate barcode for decomposition illustration
#'
#' Generates a nucleotide barcode similar to those generated by
#' 10x chromium sequencing platforms for illustration purposes.
#' Generates barcode and individual ID separated by '-' delimiter.
#'
#' @param index Integer. Index of cell ID from 0 to barcode.length to the
#'   fourth power. Will generate a unique nucleotide barcode for each
#'   index.
#' @param individual Character. ID of individual that the cell is from.
#' @param barcode.length Integer. Length of nucleotide barcode.
#' @return Simulated barcode for cell from an individual
SimulateBarcode <- function(index, individual, barcode.length){
  index <- index - 1
  barcode <- rep(0, barcode.length)
  for (i in 1:barcode.length){
    barcode[i] <- index %% 4
    index <- index %/% 4
  }
  barcode <- plyr::mapvalues(barcode, from=0:3, to=c("A","T","C","G"),
                             warn_missing = FALSE)
  barcode <- paste(c(barcode,'-', individual), collapse="")
  return(barcode)
}

#' Simulate data for decomposition illustration
#'
#' Simulates bulk and single-cell expression, as well as marker genes and
#' true proportions that can be used as an example of decomposition
#'
#' @param n.ind Integer. Number of individuals to simulate
#' @param n.genes Integer. Number of genes to simulate
#' @param n.cells Integer. Number of cells per individual for single-cell data
#' @param cell.types Character vector. List of cell types to simulate
#' @param avg.props Numeric vector. List of average proportions for given 
#'   cell types. Should be same length as cell.types and sum to 1
#' @return A list with simulated single-cell in slot `sc.eset` and bulk in
#'   `bulk.eset`, as well as true proportions in `props` and marker genes
#'   in `markers`.
#' @examples
#' library(Biobase)
#' sim.data <- SimulateData(n.ind=10, n.genes=100, n.cells=100,
#'                          cell.types=c("Neurons", "Astrocytes", "Microglia"),
#'                          avg.props=c(.5, .3, .2))
#'
#' @export
SimulateData <- function(n.ind, n.genes, n.cells, cell.types, avg.props){
  true.props <- stats::rmultinom(n.ind, n.cells, avg.props)/n.cells
  colnames(true.props) <- as.character(1:n.ind)
  rownames(true.props) <- cell.types
  reference <- replicate(length(cell.types), stats::rgeom(n.genes, prob=0.5))
  colnames(reference) <- cell.types
  rownames(reference) <- paste("Gene", 1:n.genes)
  markers <- data.frame(gene=paste("Gene", 1:n.genes),
                        cluster=sapply(1:n.genes,
                                       function(x){
                                         names(which.max(reference[x,]))
                                       }),
                        avg_logFC=sapply(1:n.genes,
                                         function(x){
                                           cell.type <- names(which.max(reference[x,]))
                                           max.expr <- reference[x,cell.type]
                                           avg.expr <- mean(reference[x,colnames(reference) != cell.type])
                                           return(log(max.expr/avg.expr))
                                         }))
  barcode.length <- max(ceiling(log(n.cells, base=4)), 4)
  sc.data <- NULL
  bulk.data <- NULL
  individual.labels <- NULL
  cell.type.labels <- NULL
  for (ind in 1:n.ind){
    counts <- NULL
    for (cell.type in cell.types){
      n.cells.type <- round(n.cells*true.props[cell.type,ind])
      if (n.cells.type > 0) {
        new.counts <- t(sapply(reference[,cell.type], 
                               function(l){
                                 stats::rpois(n.cells.type,l)
                               }))
        if (n.cells.type == 1){
          new.counts <- t(new.counts)
        }
        counts <- cbind(counts, new.counts)
        cell.type.labels <- c(cell.type.labels, rep(cell.type, n.cells.type))
      }
    }
    colnames(counts) <- sapply(1:n.cells, function(x) SimulateBarcode(x, ind, barcode.length))
    bulk.data <- cbind(bulk.data, rowSums(counts))
    sc.data <- cbind(sc.data, counts)
    individual.labels <- c(individual.labels, rep(as.character(ind), n.cells))
  }
  rownames(bulk.data) <- paste("Gene", 1:n.genes)
  rownames(sc.data) <- paste("Gene", 1:n.genes)
  sc.pheno <- data.frame(check.names=F, check.rows=F,
                         stringsAsFactors=F,
                         row.names=colnames(sc.data),
                         SubjectName=individual.labels,
                         cellType=cell.type.labels)
  sc.meta <- data.frame(labelDescription=c("SubjectName",
                                           "cellType"),
                        row.names=c("SubjectName",
                                    "cellType"))
  sc.pdata <- methods::new("AnnotatedDataFrame",
                           data=sc.pheno,
                           varMetadata=sc.meta)
  sc.eset <- Biobase::ExpressionSet(assayData=sc.data,
                                    phenoData=sc.pdata)
  bulk.eset <- Biobase::ExpressionSet(assayData=bulk.data)
  return(list(sc.eset=sc.eset, bulk.eset=bulk.eset, props=true.props, markers=markers))
}
cozygene/bisque documentation built on May 28, 2021, 7:57 p.m.