R/makeExampleBASiCS_Data.R

Defines functions makeExampleBASiCS_Data

Documented in makeExampleBASiCS_Data

#' @name makeExampleBASiCS_Data
#'
#' @title Create a synthetic SingleCellExperiment example object with the
#' format required for BASiCS
#'
#' @description A synthetic \code{\linkS4class{SingleCellExperiment}} object is
#' generated by simulating a dataset from the model underlying BASiCS. This is
#' used to illustrate BASiCS in some of the package and vignette examples.
#'
#' @param WithBatch If \code{TRUE}, 2 batches are generated (each of them
#' containing 15 cells). Default: \code{WithBatch = FALSE}.
#' @param WithSpikes If \code{TRUE}, the simulated dataset contains 20 spike-in
#' genes. If \code{WithSpikes = FALSE}, \code{WithBatch} is automatically set
#' to \code{TRUE}. Default: \code{WithSpikes = TRUE}
#'
#' @return An object of class \code{\linkS4class{SingleCellExperiment}}, with
#' synthetic data simulated from the model implemented in BASiCS.
#' If \code{WithSpikes = TRUE}, it contains 70 genes (50 biological and
#' 20 spike-in) and 30 cells. Alternatively, it contains 50 biological
#' genes and 30 cells.
#'
#' @examples
#' Data <- makeExampleBASiCS_Data()
#' is(Data, 'SingleCellExperiment')
#' 
#' @details Note: In BASiCS versions < 1.5.22, \code{makeExampleBASiCS_Data} 
#' used a fixed seed within the function. This has been removed to comply with 
#' Bioconductor policies. If a reproducible example is required, please use 
#' \code{set.seed} prior to calling \code{makeExampleBASiCS_Data} .
#'
#' @author Catalina A. Vallejos \email{cnvallej@@uc.cl}
#' @author Nils Eling \email{eling@@ebi.ac.uk}
#'
#' @export
makeExampleBASiCS_Data <- function(WithBatch = FALSE,
                                   WithSpikes = TRUE)
{
  Mu <- c( 8.36, 10.65,  4.88,  6.29, 21.72, 12.93, 30.19, 83.92,  3.89,  6.34,
          57.87, 12.45,  8.08,  7.31, 15.56, 15.91, 12.24, 15.96, 19.14,  4.20,
           6.75, 27.74,  8.88, 21.21, 19.89,  7.14, 11.09,  7.19, 20.64,  73.9,
           9.05,  6.13, 16.40,  6.08, 17.89,  6.98, 10.25, 14.05,  8.14,  5.67,
           6.95, 11.16, 11.83,  7.56,159.05, 16.41,  4.58, 15.46, 10.96, 25.05,
          18.54,   8.5,  4.05,  5.37,  4.63,  4.08,  3.75,  5.02, 27.74, 10.28,
           3.91, 13.10,  8.23,  3.64, 80.77, 20.36,  3.47, 20.12, 13.29,  7.92,
          25.51,173.01, 27.71,  4.89, 33.10,  3.42,  6.19,  4.29,  5.19,  8.36,
          10.27,  6.86,  5.72, 37.25,  3.82, 23.97,  5.80, 14.30, 29.07,  5.30,
           7.47,  8.44,  4.24, 16.15, 23.39,120.22,  8.92, 97.15,  9.75, 10.07,
        1010.72,  7.90, 31.59, 63.17,  1.97,252.68, 31.59, 31.59, 31.59, 63.17,
       4042.89,4042.89,  3.95,126.34,252.68,  7.90, 15.79,126.34,  3.95, 15.79)
  Delta <- c(1.29, 0.88, 1.51, 1.49, 0.54, 0.40, 0.85, 0.27, 0.53, 1.31,
             0.26, 0.81, 0.72, 0.70, 0.96, 0.58, 1.15, 0.82, 0.25, 5.32,
             1.13, 0.31, 0.66, 0.27, 0.76, 1.39, 1.18, 1.57, 0.55, 0.17,
             1.40, 1.47, 0.57, 2.55, 0.62, 0.77, 1.47, 0.91, 1.53, 2.89,
             1.43, 0.77, 1.37, 0.57, 0.15, 0.33, 3.99, 0.47, 0.87, 0.86,
             0.97, 1.25, 2.20, 2.19, 1.26, 1.89, 1.70, 1.89, 0.69, 1.63,
             2.83, 0.29, 1.21, 2.06, 0.20, 0.34, 0.71, 0.61, 0.71, 1.20,
             0.88, 0.17, 0.25, 1.48, 0.31, 2.49, 2.75, 1.43, 2.65, 1.97,
             0.84, 0.81, 2.75, 0.53, 2.23, 0.45, 1.87, 0.74, 0.53, 0.58,
             0.94, 0.72, 2.61, 1.56, 0.37, 0.07, 0.90, 0.12, 0.76, 1.45)
  Phi <- c(1.09, 1.16, 1.19, 1.14, 0.87, 1.10, 0.48, 1.06, 0.94, 0.97,
           1.09, 1.16, 1.19, 1.14, 0.87, 1.10, 0.48, 1.06, 0.94, 0.97,
           1.09, 1.16, 1.19, 1.14, 0.87, 1.10, 0.48, 1.06, 0.94, 0.97)
  S <- c(0.38, 0.40, 0.38, 0.39, 0.34, 0.39, 0.31, 0.39, 0.40, 0.37,
         0.38, 0.40, 0.38, 0.39, 0.34, 0.39, 0.31, 0.39, 0.40, 0.37,
         0.38, 0.40, 0.38, 0.39, 0.34, 0.39, 0.31, 0.39, 0.40, 0.37)

  Mu <- c(Mu[seq_len(50)], Mu[seq_len(20)+100])
  Delta <- Delta[seq_len(50)]

  q <- length(Mu)
  q.bio <- length(Delta)
  n <- length(Phi)

  if (!WithBatch) {
    Theta <- 0.5
  } else {
    # 2 batches, 10 cells each
    Theta1 <- 0.50
    Theta2 <- 0.75
    Theta <- ifelse(seq_len(n) <= 15, Theta1, Theta2)
  }

  if (WithSpikes) {
    # Matrix where simulated counts will be stored
    Counts.sim <- matrix(0, ncol = n, nrow = q)
    # Matrix where gene-cell specific simulated random effects will be stored
    Rho <- matrix(1, ncol = n, nrow = q.bio)
    # Simulated cell-specific random effects
    if (all(Theta > 0)) {
      Nu <- rgamma(n, shape = 1 / Theta, rate = 1 / (S * Theta))
    } else {
      Nu <- S
    }

    # Simulated counts data
    for (i in seq_len(q)) {
      if (i <= q.bio) {
        # Biological genes
        if (Delta[i] > 0) {
          Rho[i, ] <- rgamma(n, shape = 1 / Delta[i], rate = 1 / Delta[i])
        }
        Counts.sim[i, ] <- rpois(n, lambda = Phi * Nu * Rho[i, ] * Mu[i])
      } else {
        # Technical genes
        Counts.sim[i, ] <- rpois(n, lambda = Nu * Mu[i])
      }
    }

    rownames(Counts.sim) <- c(paste0("Gene", seq_len(q.bio)),
                              paste0("ERCC", seq_len(q-q.bio)))
    SpikeInfo <- data.frame(Names = paste0("ERCC", seq_len(q-q.bio)),
                            count = Mu[seq(q.bio + 1, q)])

    colnames(Counts.sim) <- paste0("Cell", seq_len(n))

    # Create the initial SCE object
    Data <- SingleCellExperiment(
      assays = list(counts = Counts.sim[seq_len(q.bio),])
    )
    # Add batch info if available
    if (WithBatch) {
      BatchInfo <- c(rep(1, 15), rep(2, 15))
    } else {
      BatchInfo <- rep(1, 30)  
    }
    colData(Data) <- S4Vectors::DataFrame(
      BatchInfo = BatchInfo,
      row.names = colnames(Counts.sim)
    )
    # Add spike-in counts and metadata
    altExp(Data, "spike-ins") <- SummarizedExperiment(
      assays = list(counts = Counts.sim[seq_len(q-q.bio) + q.bio,])
    )
    rowData(altExp(Data)) <- SpikeInfo
  } else {
    # Matrix where simulated counts will be stored
    Counts.sim <- matrix(0, ncol = n, nrow = q.bio)
    # Matrix where gene-cell specific simulated random effects will be stored
    Rho <- matrix(1, ncol = n, nrow = q.bio)
    # Simulated cell-specific random effects
    Phi[seq_len(15)] <- 2 * Phi[seq_len(15)]
    if (all(Theta > 0)) {
      Nu <- rgamma(n, shape = 1 / Theta, rate = 1 / (Phi * Theta))
    } else {
      Nu <- Phi
    }

    # Simulated counts data
    for (i in seq_len(q.bio)) {
      if (Delta[i] > 0) {
        Rho[i, ] <- rgamma(n, shape = 1 / Delta[i], rate = 1 / Delta[i])
      }
      Counts.sim[i, ] <- rpois(n, lambda = Nu * Rho[i, ] * Mu[i])
    }
    rownames(Counts.sim) <- paste0("Gene", seq_len(q.bio))
    colnames(Counts.sim) <- paste0("Cell", seq_len(n))

    Data <- SingleCellExperiment(
      assays = list(counts = Counts.sim)
    )
    colData(Data) <- S4Vectors::DataFrame(
      BatchInfo = c(rep(1, 15), rep(2, 15)),
      row.names = colnames(Counts.sim)
    )
  }
  return(Data)
}
catavallejos/BASiCS documentation built on March 27, 2024, 12:49 a.m.