R/create_bulks.R

#' simulate bulk data by summing over single-cell data
#'
#' @param exprs non-negative numeric, scRNA-seq profiles as columns,
#' features as rows
#' @param pheno data.frame, phenotype data containing
#' cell type labels for the expression matrix,
#' must contain 'cell.type.column',
#' ordering of cells must be the same as in exprs
#' @param n.bulks integer, the number of bulks to be created, defaults to 500
#' @param include.in.bulks vector of strings, cell types to be used for bulk
#' simulation; if not supplied, all will be used
#' @param n.profiles.per.bulk positive numeric, number of single-cell profiles
#' to be randomly drawn for each bulk; default 1000
#' @param sum.to.count boolean, should all bulks be normalized
#' to a fixed total count number (1e6)? default TRUE
#' @param cell.type.column string, which column of 'pheno'
#' holds the cell type information? default "cell_type"
#' @param frac numeric >= 1; width (in percentage points) of the window from
#' which the per-bulk weight of each cell type is drawn, i.e. it controls the
#' variability of cell type proportions across the simulated bulks; default 5
#' @param switch.prob numeric, 0 <= switch.prob <= 1, fraction of bulks
#' sampled with inverted cell type proportions; default 0
#' @return list with
#'    - "bulks": matrix containing bulk expression profiles (features x bulks)
#'    - "props" matrix containing quantities (cell type x bulks)
#' @example create_bulks(training.exprs, training.pheno, n.bulks = 1000)
#' @export

create_bulks <- function(
  exprs,
  pheno,
  cell.type.column = "cell_type",
  n.bulks = 500,
  include.in.bulks = NULL,
  n.profiles.per.bulk = 1000,
  sum.to.count = TRUE,
  frac = 5,
  switch.prob = 0
) {
  # parameter checks
  if (nrow(pheno) != ncol(exprs)) {
    stop("Number of columns in exprs and rows in pheno do not match")
  }
  if (!is.numeric(n.bulks)) {
    stop("n.bulks must be numeric")
  }
  if (!is.numeric(n.profiles.per.bulk) || n.profiles.per.bulk <= 0) {
    stop("n.profiles.per.bulk must be numeric > 0")
  }
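  # make cells addressable by the same identifiers in both objects;
  # this relies on pheno rows and exprs columns having the same order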
  rownames(pheno) <- colnames(exprs)

  # keep only the specified cell types and drop cells labelled "unassigned"
  if (is.null(include.in.bulks)) {
    include.in.bulks <- unique(pheno[, cell.type.column])
  }
  to.remove <- which(pheno[, cell.type.column] == "unassigned")
  if (length(to.remove) > 0) {
    pheno <- pheno[-to.remove, ]
    exprs <- exprs[, -to.remove]
  }

  cell.types <- pheno[, cell.type.column]
  if (any(cell.types %in% include.in.bulks)) {
    pheno <- pheno[which(cell.types %in% include.in.bulks), , drop = FALSE]
  }
  exprs <- exprs[, rownames(pheno), drop = FALSE]

  # create a matrix to contain the bulk expression profiles
  bulk.exprs <- matrix(
    0,
    nrow = nrow(exprs),
    ncol = n.bulks
  )
  rownames(bulk.exprs) <- rownames(exprs)
  colnames(bulk.exprs) <- as.character(1:n.bulks)

  # create a matrix to contain true proportions for each simulated bulk
  props <- matrix(0,
    nrow = length(unique(pheno[, cell.type.column])),
    ncol = n.bulks
  )
  rownames(props) <- unique(pheno[, cell.type.column])
  colnames(props) <- colnames(bulk.exprs)

  types <- unique(pheno[, cell.type.column])
  types <- sample(types, length(types), replace = FALSE)

  # overall percentage of each cell type among the retained cells
  ct.fracs <- sort(
    table(pheno[, cell.type.column]) / nrow(pheno) * 100,
    decreasing = TRUE
  )
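  # these percentages serve as baseline proportions; the per-bulk weights
  # sampled below vary around them within a window of width 'frac'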

  if (switch.prob > 0) {
    # inverted cell type fractions, assigned to the cell types in random order
    ct.fracs.2 <- 100 - ct.fracs
    names(ct.fracs.2) <- types
  } else {
    ct.fracs.2 <- ct.fracs
  }

  for (i in seq_len(n.bulks)) {
    bulk.samples <- c()
    weights <- c()
    # with probability switch.prob, sample this bulk with the inverted
    # cell type fractions
    switch <- runif(1) < switch.prob
    for (ct in types) {
      if (switch) {
        ct.frac <- ct.fracs.2[ct]
      } else {
        ct.frac <- ct.fracs[ct]
      }

      # draw a weight uniformly from a window of width ~'frac' around ct.frac;
      # near the boundaries the window is shifted to stay within [1, 100]
      if (frac / 2 > ct.frac) {
        frac.min <- 1
        frac.max <- frac
      } else if (frac / 2 > 100 - ct.frac) {
        frac.max <- 100
        frac.min <- 100 - frac
      } else {
        frac.min <- ct.frac - frac / 2
        frac.max <- ct.frac + frac / 2
      }
      frac.min <- as.integer(frac.min)
      frac.max <- as.integer(frac.max)
      w <- sample(frac.min:frac.max, 1)

      if (w < 1) w <- 1
      if (w > 100) w <- 100
      weights <- c(weights, w)
    }
    names(weights) <- types
    if (any(weights < 0)) {
      weights[weights < 0] <- 0
    }
    weights <- weights / sum(weights)
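    # the normalized weights are the target cell type proportions for this
    # bulk and act as sampling probabilities in the draw below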

    # determine how many profiles of each type should be drawn
    n.samples <- sample(
      types,
      size = ceiling(n.profiles.per.bulk),
      replace = TRUE,
      prob = weights
    )
    type.table <- table(n.samples)
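    # type.table[t] is the number of single-cell profiles of type t
    # that will be pooled into bulk i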

    # draw samples for each type, store the proportions and the expression
    for (t in types) {
      if (t %in% names(type.table)) {
        coarse.samples <- sample(
          which(pheno[, cell.type.column] == t),
          size = type.table[t],
          replace = TRUE
        )
        bulk.samples <- c(bulk.samples, coarse.samples)
        props[t, i] <- length(coarse.samples)
      }
    }
    props[, i] <- props[, i] / length(bulk.samples)

    bulk.exprs[, i] <- Matrix::rowSums(exprs[, bulk.samples, drop = FALSE])
  }

  # ensure that no feature occurs twice in the simulated bulks
  if (any(duplicated(rownames(bulk.exprs)))) {
    warning("Found duplicate features in simulated bulks. Removing...")
    bulk.exprs <- bulk.exprs[!duplicated(rownames(bulk.exprs)), , drop = FALSE]
  }

  # scale each bulk to a fixed total count (1e6) if sum.to.count is TRUE
  if (sum.to.count) {
    bulk.exprs <- scale_to_count(bulk.exprs)
  }

  return(list(bulks = bulk.exprs, props = props))
}
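
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the package): builds a
# toy single-cell matrix with three made-up cell types and simulates a few
# bulks. All object names below are hypothetical; 'sum.to.count = FALSE' is
# used so the sketch does not depend on the internal scale_to_count() helper.
# ---------------------------------------------------------------------------
# set.seed(1)
# n.cells <- 300
# toy.exprs <- matrix(
#   rpois(50 * n.cells, lambda = 2),
#   nrow = 50,
#   dimnames = list(paste0("gene", 1:50), paste0("cell", 1:n.cells))
# )
# toy.pheno <- data.frame(
#   cell_type = sample(c("T", "B", "NK"), n.cells, replace = TRUE),
#   row.names = colnames(toy.exprs)
# )
# sim <- create_bulks(
#   toy.exprs,
#   toy.pheno,
#   n.bulks = 10,
#   n.profiles.per.bulk = 100,
#   sum.to.count = FALSE
# )
# dim(sim$bulks)       # 50 features x 10 bulks
# colSums(sim$props)   # each column of proportions sums to 1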