R/deg_simulation.R

Defines functions simulate_sample_groups simulate_dropout2 simulate_dropout make_DEG_data2 make_DEG_data make_DEG_pattern

Documented in make_DEG_data make_DEG_data2 make_DEG_pattern simulate_dropout simulate_dropout2 simulate_sample_groups

#' Simulate differentially expressed gene pattern
#' 
#' Generate complicated differentially expressed gene (DEG) pattern to simulate 
#' varied degree of heterogeneity.
#' 
#' The heterogeneity of gene expression pattern is mainly controlled by
#' two parameters: \code{sample.rate} and \code{active.rate}. If both
#' parameters are equal to 1, the gene expression pattern will be homogeneous,
#' otherwise heterogeneous.
#' 
#' 
#' @param n.genes The total number of genes in the simulated data.
#' @param n.samples The total number of samples in the simulated data.
#' @param fold.change The fold change level of DEGs.
#' @param gene.rate The proportion of DEGs to all genes.
#' @param sample.rate The proportion of abnormal samples to all samples.
#' @param active.rate The probability that a DEG is truely differentially 
#' expressed in an abnormal sample.
#' @param up.rate The proportion of up-regulated DEGs to all DEGs.
#' 
#' @return This function will return a list with the following components:
#'   \item{FC}{The matrix of simulated fold changes. Each row represents a gene and each column represents a sample.}
#'   \item{gene}{The vector of gene status: 1 for up-regulated, -1 for down-regulated, and 0 for normal genes.}
#'   \item{sample}{The vector of sample status: 1 for abnormal, and 0 for normal samples.}
#' 
#' @export
make_DEG_pattern <- function(n.genes, n.samples, fold.change = 2, gene.rate = 0.3, sample.rate = 1.0, active.rate = 1.0, up.rate = 0.5)
{
  active.gene <- stats::runif(n.genes) <= gene.rate
  active.sample <- stats::runif(n.samples) <= sample.rate
  if (sum(active.gene) < 1) active.gene[sample.int(n.genes, 1)] <- TRUE
  if (sum(active.sample) < 1) active.sample[sample.int(n.samples, 1)] <- TRUE
  up.gene <- logical(n.genes)
  up.gene[active.gene] <- stats::runif(sum(active.gene)) <= up.rate
  down.gene <- active.gene & !up.gene
  fc.sub <- matrix(fold.change, sum(active.gene), sum(active.sample))
  fc.sub[down.gene[active.gene], ] <- 1/fold.change
  fc.sub[stats::runif(length(fc.sub)) > active.rate] <- 1
  fc <- matrix(1, n.genes, n.samples)
  fc[active.gene, active.sample] <- fc.sub
  return(list(FC=fc, gene=up.gene-down.gene, sample=as.integer(active.sample)))
}


#' Simulate differentially expressed gene data (Gaussian)
#' 
#' Generate differentially expressed gene (DEG) data from Gaussian distribution.
#' 
#' The expression values of each gene are assumed following a Gaussian distribution with 
#' gene-specific mean, which follows a log-normal distribution. The size factor for each
#' sample follows a Gaussian distribution with zero mean and specific standard deviation.
#' The heterogeneity of gene expression data is simulated by using the function \code{\link{make_DEG_pattern}}.
#' 
#' 
#' @param n.genes The total number of genes in the simulated data.
#' @param n.samples.A The number of samples in the group A.
#' @param n.samples.B The number of samples in the group B.
#' @param exp.mean The mean of log-normal distribution that determines gene-specific expression mean.
#' @param exp.sd The standard deviation of log-normal distribution that determines gene-specific expression means.
#' @param alpha The dispersion ratio of gene-specific expression standard deviation to mean.
#' @param size.factor.sd The standard deviation of size factors for samples.
#' @param ... The parameters passed to function \code{\link{make_DEG_pattern}}.
#' 
#' @return This function will return a list with the following components:
#'   \item{DEG}{The matrix of simulated DEG pattern, which is generated by \code{\link{make_DEG_pattern}}.}
#'   \item{countsA}{The expression matrix of group A. Each row represents a gene and each column represents a sample.}
#'   \item{countsB}{The expression matrix of group B. Each row represents a gene and each column represents a sample.}
#' 
#' @export
make_DEG_data <- function(n.genes, n.samples.A, n.samples.B, exp.mean = 8, exp.sd = 2, alpha = 0.2, size.factor.sd = 0.1, ...)
{
  # simulate DEG and heterogeneity
  deg <- make_DEG_pattern(n.genes, n.samples.B, ...)
  # simulate expression mean and dispersion
  mu0 <- 2^stats::rnorm(n.genes, exp.mean, exp.sd)
  sd0 <- alpha * mu0
  # simulate group A
  sfA <- 2^stats::rnorm(n.samples.A, sd = size.factor.sd)
  countsA <- matrix(stats::rnorm(n.genes * n.samples.A, mean = mu0 %*% t(sfA), sd = sd0), nrow = n.genes)
  # simulate group B
  sfB <- 2^stats::rnorm(n.samples.B, sd = size.factor.sd)
  countsB <- matrix(stats::rnorm(n.genes * n.samples.B, mean = mu0 %*% t(sfB) * deg$FC, sd = sd0 * deg$FC), nrow = n.genes)
  countsA[countsA < 0] <- 0
  countsB[countsB < 0] <- 0
  return(list(DEG=deg, countsA=countsA, countsB=countsB, factorA=sfA, factorB=sfB))
}


#' Simulate differentially expressed gene data (Negative binomial)
#' 
#' Generate differentially expressed gene (DEG) data from negative binomial distribution.
#' 
#' The expression values of each gene are assumed following a negative binomial distribution with 
#' gene-specific mean, which follows a log-normal distribution. The size factor for each
#' sample follows a Gaussian distribution with zero mean and specific standard deviation.
#' The heterogeneity of gene expression data is simulated by using the function \code{\link{make_DEG_pattern}}.
#' 
#' 
#' @param n.genes The total number of genes in the simulated data.
#' @param n.samples.A The number of samples in the group A.
#' @param n.samples.B The number of samples in the group B.
#' @param exp.mean The mean of log-normal distribution that determines gene-specific expression mean.
#' @param exp.sd The standard deviation of log-normal distribution that determines gene-specific expression mean.
#' @param dispersion The dispersion parameter for negative binomial distribution. The default values are determined by the expression mean.
#' @param size.factor.sd The standard deviation of size factors for samples.
#' @param ... The parameters passed to function \code{\link{make_DEG_pattern}}.
#' 
#' @return This function will return a list with the following components:
#'   \item{DEG}{The matrix of simulated DEG pattern, which is generated by \code{\link{make_DEG_pattern}}.}
#'   \item{countsA}{The expression matrix of group A. Each row represents a gene and each column represents a sample.}
#'   \item{countsB}{The expression matrix of group B. Each row represents a gene and each column represents a sample.}
#' 
#' @export
make_DEG_data2 <- function(n.genes, n.samples.A, n.samples.B, exp.mean = 8, exp.sd = 2, dispersion = NULL, size.factor.sd = 0.1, ...)
{
# simulate DEG and heterogeneity
  deg <- make_DEG_pattern(n.genes, n.samples.B, ...)
# simulate expression mean
  mu0 <- 2^stats::rnorm(n.genes, exp.mean, exp.sd)
  if (is.null(dispersion)) dispersion <- 4/mu0 + 0.1
# simulate group A
  sfA <- 2^stats::rnorm(n.samples.A, sd = size.factor.sd)
  countsA <- matrix(stats::rnbinom(n.genes * n.samples.A, mu = mu0 %*% t(sfA), size = 1/dispersion), nrow = n.genes)
# simulate group B
  sfB <- 2^stats::rnorm(n.samples.B, sd = size.factor.sd)
  countsB <- matrix(stats::rnbinom(n.genes * n.samples.B, mu = mu0 %*% t(sfB) * deg$FC, size = 1/dispersion), nrow = n.genes)
  return(list(DEG=deg, countsA=countsA, countsB=countsB, factorA=sfA, factorB=sfB))
}


#' Simulate dropout expression data
#' 
#' Generate the expression data with desired dropout rate
#' 
#' The dropout event is modelled by a logistic distribution such that the low expression genes have 
#' higher probability of dropout. The expression value of genes in a sample are randomly set to zero
#' with probabilities associated with their true expression values until the desired dropout rate
#' for that sample is meet.
#' 
#' @param counts expression matrix where each row is a gene and each column is a sample.
#' @param dropout.rate the desired average dropout rate of all samples.
#' @param dropout.rate.sd the desired standard deviation of dropout rate among samples.
#' 
#' @return This function will return a list with the following components:
#'   \item{counts}{The modified expression matrix with the same dimension as input \code{counts}.}
#'   \item{original.counts}{The original input expression matrix.}
#'   \item{dropout}{The binary matrix indicating where the dropout events happen.}
#' 
#' @references Peter V. Kharchenko, Lev Silberstein, and David T. Scadden.
#' Bayesian approach to single-cell differential expression analysis.
#' Nature Methods, 11(7):740–742, 2014.
#' 
#' @export
simulate_dropout <- function(counts, dropout.rate = 0, dropout.rate.sd = 0.1)
{
  n.genes <- dim(counts)[1]
  n.samples <- dim(counts)[2]
  intercept <- stats::rnorm(n.samples, 1.5, 0.5)
  slope <- stats::rnorm(n.samples, 3, 1)
  d <- sapply(1:n.samples, function(i) 1 / (1 + exp(slope[i] * (log10(counts[, i]) - intercept[i]))))
  r <- 2^stats::rnorm(n.samples, sd = dropout.rate.sd) * dropout.rate
  r[r > 1] <- 1
  r[r < 0] <- 0
  dropout <- matrix(0, n.genes, n.samples)
  for (i in 1:n.samples)
  {
    gi <- which(counts[, i] > 0)
    ri <- round(r[i] * length(gi))
    dropout[sample(gi, ri, prob = d[gi, i]), i] <- 1
  }
  new.counts <- counts * (1 - dropout)
  list(counts = new.counts, original.counts = counts, dropout = dropout)
}


#' Simulate dropout expression data
#' 
#' Generate the expression data with desired dropout rate range
#' 
#' The dropout event is modelled by a logistic distribution such that the low expression genes have 
#' higher probability of dropout. The expression value of genes in a sample are randomly set to zero
#' with probabilities associated with their true expression values until the desired dropout rate
#' for that sample is meet.
#' 
#' @param counts expression matrix where each row is a gene and each column is a sample.
#' @param min.rate the minimum dropout rate of all samples.
#' @param max.rate the maximum dropout rate of all samples.
#' 
#' @return This function will return a list with the following components:
#'   \item{counts}{The modified expression matrix with the same dimension as input \code{counts}.}
#'   \item{original.counts}{The original input expression matrix.}
#'   \item{dropout}{The binary matrix indicating where the dropout events happen.}
#' 
#' @references Peter V. Kharchenko, Lev Silberstein, and David T. Scadden.
#' Bayesian approach to single-cell differential expression analysis.
#' Nature Methods, 11(7):740–742, 2014.
#' 
#' @export
simulate_dropout2 <- function(counts, min.rate = 0, max.rate = 0.8)
{
  n.genes <- dim(counts)[1]
  n.samples <- dim(counts)[2]
  intercept <- stats::rnorm(n.samples, 1.5, 0.5)
  slope <- stats::rnorm(n.samples, 3, 1)
  d <- sapply(1:n.samples, function(i) 1 / (1 + exp(slope[i] * (log10(counts[, i]) - intercept[i]))))
  r <- stats::runif(n.samples, min.rate, max.rate)
  r[r > 1] <- 1
  r[r < 0] <- 0
  dropout <- matrix(0, n.genes, n.samples)
  for (i in 1:n.samples)
  {
    gi <- which(counts[, i] > 0)
    ri <- round(r[i] * length(gi))
    dropout[sample(gi, ri, prob = d[gi, i]), i] <- 1
  }
  new.counts <- counts * (1 - dropout)
  list(counts = new.counts, original.counts = counts, dropout = dropout)
}


#' Simulate sample groups from given samples with labels
#' 
#' Generate sample groups with desired labels and sizes from given sample labels.
#' 
#' @param labels a vector containing the label of each sample in the pool.
#' @param groups a vector containing the desired label of samples in each group.
#' The label must be available in the sample pool provided by \code{labels}.
#' @param sizes integer vector indicating the desired number of samples in each group.
#' The length must be either one or the same as \code{groups}.
#' @param replace logical variable indicating whether sampling is with replacement.
#' 
#' @return This function will return a list with the same length as \code{groups}.
#' Each component is a vector containing the indexes of samples that are sampled for
#' the corresponding group.
#' 
#' @export
simulate_sample_groups <- function(labels, groups, sizes, replace = FALSE)
{
  if (sum(groups %in% labels) < length(groups)) stop("The elements in 'groups' must be available labels!")
  if (length(sizes) != length(groups))
  {
    if (length(sizes) == 1) sizes <- rep_len(sizes, length(groups))
    else stop("The lengths of 'groups' and 'sizes' must be equal!")
  }
  if (replace)
  {
    sample.groups <- lapply(1:length(groups), function(i) sample(which(labels == groups[i]), sizes[i], replace = TRUE))
  }
  else
  {
    u.groups <- unique(groups)
    u.sizes <- sapply(u.groups, function(id) sum(sizes[groups == id]))
    total.sizes <- sapply(u.groups, function(id) sum(labels == id))
    if (any(u.sizes > total.sizes)) stop("The total sampling size is too larger when 'replace = FALSE'!")
    u.samples <- lapply(1:length(u.groups), function(i) sample(which(labels == u.groups[i]), u.sizes[i], replace = FALSE))
    sample.groups <- list()
    for (k in 1:length(u.groups))
    {
      k.groups <- groups == u.groups[k]
      k.id <- 1:sum(k.groups)
      partition <- sample(rep(k.id, times = sizes[k.groups]), u.sizes[k])
      sample.groups[k.groups] <- lapply(k.id, function(i) u.samples[[k]][partition == i])
    }
  }
  names(sample.groups) <- groups
  sample.groups
}

Try the Corbi package in your browser

Any scripts or data that you put into this service are public.

Corbi documentation built on May 3, 2022, 3:01 a.m.