Nothing
#' Simulate differentially expressed gene pattern
#'
#' Generate complicated differentially expressed gene (DEG) pattern to simulate
#' varied degree of heterogeneity.
#'
#' The heterogeneity of gene expression pattern is mainly controlled by
#' two parameters: \code{sample.rate} and \code{active.rate}. If both
#' parameters are equal to 1, the gene expression pattern will be homogeneous,
#' otherwise heterogeneous.
#'
#'
#' @param n.genes The total number of genes in the simulated data.
#' @param n.samples The total number of samples in the simulated data.
#' @param fold.change The fold change level of DEGs.
#' @param gene.rate The proportion of DEGs to all genes.
#' @param sample.rate The proportion of abnormal samples to all samples.
#' @param active.rate The probability that a DEG is truely differentially
#' expressed in an abnormal sample.
#' @param up.rate The proportion of up-regulated DEGs to all DEGs.
#'
#' @return This function will return a list with the following components:
#' \item{FC}{The matrix of simulated fold changes. Each row represents a gene and each column represents a sample.}
#' \item{gene}{The vector of gene status: 1 for up-regulated, -1 for down-regulated, and 0 for normal genes.}
#' \item{sample}{The vector of sample status: 1 for abnormal, and 0 for normal samples.}
#'
#' @export
make_DEG_pattern <- function(n.genes, n.samples, fold.change = 2, gene.rate = 0.3, sample.rate = 1.0, active.rate = 1.0, up.rate = 0.5)
{
active.gene <- stats::runif(n.genes) <= gene.rate
active.sample <- stats::runif(n.samples) <= sample.rate
if (sum(active.gene) < 1) active.gene[sample.int(n.genes, 1)] <- TRUE
if (sum(active.sample) < 1) active.sample[sample.int(n.samples, 1)] <- TRUE
up.gene <- logical(n.genes)
up.gene[active.gene] <- stats::runif(sum(active.gene)) <= up.rate
down.gene <- active.gene & !up.gene
fc.sub <- matrix(fold.change, sum(active.gene), sum(active.sample))
fc.sub[down.gene[active.gene], ] <- 1/fold.change
fc.sub[stats::runif(length(fc.sub)) > active.rate] <- 1
fc <- matrix(1, n.genes, n.samples)
fc[active.gene, active.sample] <- fc.sub
return(list(FC=fc, gene=up.gene-down.gene, sample=as.integer(active.sample)))
}
#' Simulate differentially expressed gene data (Gaussian)
#'
#' Generate differentially expressed gene (DEG) data from Gaussian distribution.
#'
#' The expression values of each gene are assumed following a Gaussian distribution with
#' gene-specific mean, which follows a log-normal distribution. The size factor for each
#' sample follows a Gaussian distribution with zero mean and specific standard deviation.
#' The heterogeneity of gene expression data is simulated by using the function \code{\link{make_DEG_pattern}}.
#'
#'
#' @param n.genes The total number of genes in the simulated data.
#' @param n.samples.A The number of samples in the group A.
#' @param n.samples.B The number of samples in the group B.
#' @param exp.mean The mean of log-normal distribution that determines gene-specific expression mean.
#' @param exp.sd The standard deviation of log-normal distribution that determines gene-specific expression means.
#' @param alpha The dispersion ratio of gene-specific expression standard deviation to mean.
#' @param size.factor.sd The standard deviation of size factors for samples.
#' @param ... The parameters passed to function \code{\link{make_DEG_pattern}}.
#'
#' @return This function will return a list with the following components:
#' \item{DEG}{The matrix of simulated DEG pattern, which is generated by \code{\link{make_DEG_pattern}}.}
#' \item{countsA}{The expression matrix of group A. Each row represents a gene and each column represents a sample.}
#' \item{countsB}{The expression matrix of group B. Each row represents a gene and each column represents a sample.}
#'
#' @export
make_DEG_data <- function(n.genes, n.samples.A, n.samples.B, exp.mean = 8, exp.sd = 2, alpha = 0.2, size.factor.sd = 0.1, ...)
{
# simulate DEG and heterogeneity
deg <- make_DEG_pattern(n.genes, n.samples.B, ...)
# simulate expression mean and dispersion
mu0 <- 2^stats::rnorm(n.genes, exp.mean, exp.sd)
sd0 <- alpha * mu0
# simulate group A
sfA <- 2^stats::rnorm(n.samples.A, sd = size.factor.sd)
countsA <- matrix(stats::rnorm(n.genes * n.samples.A, mean = mu0 %*% t(sfA), sd = sd0), nrow = n.genes)
# simulate group B
sfB <- 2^stats::rnorm(n.samples.B, sd = size.factor.sd)
countsB <- matrix(stats::rnorm(n.genes * n.samples.B, mean = mu0 %*% t(sfB) * deg$FC, sd = sd0 * deg$FC), nrow = n.genes)
countsA[countsA < 0] <- 0
countsB[countsB < 0] <- 0
return(list(DEG=deg, countsA=countsA, countsB=countsB, factorA=sfA, factorB=sfB))
}
#' Simulate differentially expressed gene data (Negative binomial)
#'
#' Generate differentially expressed gene (DEG) data from negative binomial distribution.
#'
#' The expression values of each gene are assumed following a negative binomial distribution with
#' gene-specific mean, which follows a log-normal distribution. The size factor for each
#' sample follows a Gaussian distribution with zero mean and specific standard deviation.
#' The heterogeneity of gene expression data is simulated by using the function \code{\link{make_DEG_pattern}}.
#'
#'
#' @param n.genes The total number of genes in the simulated data.
#' @param n.samples.A The number of samples in the group A.
#' @param n.samples.B The number of samples in the group B.
#' @param exp.mean The mean of log-normal distribution that determines gene-specific expression mean.
#' @param exp.sd The standard deviation of log-normal distribution that determines gene-specific expression mean.
#' @param dispersion The dispersion parameter for negative binomial distribution. The default values are determined by the expression mean.
#' @param size.factor.sd The standard deviation of size factors for samples.
#' @param ... The parameters passed to function \code{\link{make_DEG_pattern}}.
#'
#' @return This function will return a list with the following components:
#' \item{DEG}{The matrix of simulated DEG pattern, which is generated by \code{\link{make_DEG_pattern}}.}
#' \item{countsA}{The expression matrix of group A. Each row represents a gene and each column represents a sample.}
#' \item{countsB}{The expression matrix of group B. Each row represents a gene and each column represents a sample.}
#'
#' @export
make_DEG_data2 <- function(n.genes, n.samples.A, n.samples.B, exp.mean = 8, exp.sd = 2, dispersion = NULL, size.factor.sd = 0.1, ...)
{
# simulate DEG and heterogeneity
deg <- make_DEG_pattern(n.genes, n.samples.B, ...)
# simulate expression mean
mu0 <- 2^stats::rnorm(n.genes, exp.mean, exp.sd)
if (is.null(dispersion)) dispersion <- 4/mu0 + 0.1
# simulate group A
sfA <- 2^stats::rnorm(n.samples.A, sd = size.factor.sd)
countsA <- matrix(stats::rnbinom(n.genes * n.samples.A, mu = mu0 %*% t(sfA), size = 1/dispersion), nrow = n.genes)
# simulate group B
sfB <- 2^stats::rnorm(n.samples.B, sd = size.factor.sd)
countsB <- matrix(stats::rnbinom(n.genes * n.samples.B, mu = mu0 %*% t(sfB) * deg$FC, size = 1/dispersion), nrow = n.genes)
return(list(DEG=deg, countsA=countsA, countsB=countsB, factorA=sfA, factorB=sfB))
}
#' Simulate dropout expression data
#'
#' Generate the expression data with desired dropout rate
#'
#' The dropout event is modelled by a logistic distribution such that the low expression genes have
#' higher probability of dropout. The expression value of genes in a sample are randomly set to zero
#' with probabilities associated with their true expression values until the desired dropout rate
#' for that sample is meet.
#'
#' @param counts expression matrix where each row is a gene and each column is a sample.
#' @param dropout.rate the desired average dropout rate of all samples.
#' @param dropout.rate.sd the desired standard deviation of dropout rate among samples.
#'
#' @return This function will return a list with the following components:
#' \item{counts}{The modified expression matrix with the same dimension as input \code{counts}.}
#' \item{original.counts}{The original input expression matrix.}
#' \item{dropout}{The binary matrix indicating where the dropout events happen.}
#'
#' @references Peter V. Kharchenko, Lev Silberstein, and David T. Scadden.
#' Bayesian approach to single-cell differential expression analysis.
#' Nature Methods, 11(7):740–742, 2014.
#'
#' @export
simulate_dropout <- function(counts, dropout.rate = 0, dropout.rate.sd = 0.1)
{
n.genes <- dim(counts)[1]
n.samples <- dim(counts)[2]
intercept <- stats::rnorm(n.samples, 1.5, 0.5)
slope <- stats::rnorm(n.samples, 3, 1)
d <- sapply(1:n.samples, function(i) 1 / (1 + exp(slope[i] * (log10(counts[, i]) - intercept[i]))))
r <- 2^stats::rnorm(n.samples, sd = dropout.rate.sd) * dropout.rate
r[r > 1] <- 1
r[r < 0] <- 0
dropout <- matrix(0, n.genes, n.samples)
for (i in 1:n.samples)
{
gi <- which(counts[, i] > 0)
ri <- round(r[i] * length(gi))
dropout[sample(gi, ri, prob = d[gi, i]), i] <- 1
}
new.counts <- counts * (1 - dropout)
list(counts = new.counts, original.counts = counts, dropout = dropout)
}
#' Simulate dropout expression data
#'
#' Generate the expression data with desired dropout rate range
#'
#' The dropout event is modelled by a logistic distribution such that the low expression genes have
#' higher probability of dropout. The expression value of genes in a sample are randomly set to zero
#' with probabilities associated with their true expression values until the desired dropout rate
#' for that sample is meet.
#'
#' @param counts expression matrix where each row is a gene and each column is a sample.
#' @param min.rate the minimum dropout rate of all samples.
#' @param max.rate the maximum dropout rate of all samples.
#'
#' @return This function will return a list with the following components:
#' \item{counts}{The modified expression matrix with the same dimension as input \code{counts}.}
#' \item{original.counts}{The original input expression matrix.}
#' \item{dropout}{The binary matrix indicating where the dropout events happen.}
#'
#' @references Peter V. Kharchenko, Lev Silberstein, and David T. Scadden.
#' Bayesian approach to single-cell differential expression analysis.
#' Nature Methods, 11(7):740–742, 2014.
#'
#' @export
simulate_dropout2 <- function(counts, min.rate = 0, max.rate = 0.8)
{
n.genes <- dim(counts)[1]
n.samples <- dim(counts)[2]
intercept <- stats::rnorm(n.samples, 1.5, 0.5)
slope <- stats::rnorm(n.samples, 3, 1)
d <- sapply(1:n.samples, function(i) 1 / (1 + exp(slope[i] * (log10(counts[, i]) - intercept[i]))))
r <- stats::runif(n.samples, min.rate, max.rate)
r[r > 1] <- 1
r[r < 0] <- 0
dropout <- matrix(0, n.genes, n.samples)
for (i in 1:n.samples)
{
gi <- which(counts[, i] > 0)
ri <- round(r[i] * length(gi))
dropout[sample(gi, ri, prob = d[gi, i]), i] <- 1
}
new.counts <- counts * (1 - dropout)
list(counts = new.counts, original.counts = counts, dropout = dropout)
}
#' Simulate sample groups from given samples with labels
#'
#' Generate sample groups with desired labels and sizes from given sample labels.
#'
#' @param labels a vector containing the label of each sample in the pool.
#' @param groups a vector containing the desired label of samples in each group.
#' The label must be available in the sample pool provided by \code{labels}.
#' @param sizes integer vector indicating the desired number of samples in each group.
#' The length must be either one or the same as \code{groups}.
#' @param replace logical variable indicating whether sampling is with replacement.
#'
#' @return This function will return a list with the same length as \code{groups}.
#' Each component is a vector containing the indexes of samples that are sampled for
#' the corresponding group.
#'
#' @export
simulate_sample_groups <- function(labels, groups, sizes, replace = FALSE)
{
if (sum(groups %in% labels) < length(groups)) stop("The elements in 'groups' must be available labels!")
if (length(sizes) != length(groups))
{
if (length(sizes) == 1) sizes <- rep_len(sizes, length(groups))
else stop("The lengths of 'groups' and 'sizes' must be equal!")
}
if (replace)
{
sample.groups <- lapply(1:length(groups), function(i) sample(which(labels == groups[i]), sizes[i], replace = TRUE))
}
else
{
u.groups <- unique(groups)
u.sizes <- sapply(u.groups, function(id) sum(sizes[groups == id]))
total.sizes <- sapply(u.groups, function(id) sum(labels == id))
if (any(u.sizes > total.sizes)) stop("The total sampling size is too larger when 'replace = FALSE'!")
u.samples <- lapply(1:length(u.groups), function(i) sample(which(labels == u.groups[i]), u.sizes[i], replace = FALSE))
sample.groups <- list()
for (k in 1:length(u.groups))
{
k.groups <- groups == u.groups[k]
k.id <- 1:sum(k.groups)
partition <- sample(rep(k.id, times = sizes[k.groups]), u.sizes[k])
sample.groups[k.groups] <- lapply(k.id, function(i) u.samples[[k]][partition == i])
}
}
names(sample.groups) <- groups
sample.groups
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.