#' Synthetic reads generator for genetic variants
#'
#' There are following steps to generate the simulated reads counts for variants
#' in single cells:
#' 1) given the clonal genotype and the clonal prevalence, the genotypes (i.e,
#' the clone) of cells will be generated following a multinomial distribution.
#' Note, one cell may contain variants from two clones when it is a doublet.
#' 2) given the distribution of reads coverage, e.g., a matrix of read coverage
#' from real data, (variant specific), the total reads of each variant will be
#' generated by random sampling. Note, the missing rate is governed by this
#' matrix.
#' 3) the allelic frequency of each variant will be generated by following a
#' beta distribution with parameters of mean and variance.
#' 4) Given the genotype of a cell, if the mutation exists in a cell, the
#' alteration read counts will be generated by a binomial distribution,
#' parameterized the allelic frequency, sampled from step 3.
#' 5) Given the genotype of a cell, if the mutation does not exist in a cell,
#' the alteration read counts will be generated by a binomial distribution,
#' parameterized by the technical error rate.
#'
#' @export
#'
#' @param Config A matrix of binary values. The clone-variant configuration,
#' which encodes the phylogenetic tree structure, and the genotype of each clone
#' @param D A matrix of integers. Sequencing depth for N variants across x
#' cells (ideally >100 cells). NA means 0 here.
#' @param Psi A vector of float. The fractions of each clone. If NULL, set a
#' uniform distribution.
#' @param means A vector of two floats. The mean theta_1 (false positive rate)
#' and the mean theta_2 (true positive rate).
#' @param vars A vector of two floats. The variance of theta_1 and theta_2.
#' @param cell_num A integer. The number of cells to generate.
#' @param permute_D A Boolean value. If True permute variants in D.
#' @param wise0 A string, the beta-binomial parameter specificity for theta0:
#' global, variant, element.
#' @param wise1 A string, the beta-binomial parameter specificity for theta1:
#' global, variant, element.
#' @param doublet A float between 0 and 1, the rate of doublets
#' @param sample_cell A Boolean value. If True and M > ncol(D), sample cells.
#'
#' @return a list containing \code{A_sim}, a matrix for alteration reads,
#' \code{A_sim}, a matrix for total reads, \code{I_sim}, a matrix for clonal
#' label, \code{H_sim}, a matrix for genotype, \code{theta0}, a matrix of
#' expected false positive rate, \code{theta1}, a matrix of expected true
#' positive rate, \code{theta0_binom}, theta0 as binomial parameter,
#' \code{theta1_binom}, theta0 as binomial parameter, and \code{is_doublet}, a
#' vector of Boolean value if a cell is a doublet
#'
#' @examples
#' data(simulation_input)
#' D2 <- sample_seq_depth(D_input, n_cells = 500, n_sites = nrow(tree_4clone$Z))
#' simu <- sim_read_count(tree_4clone$Z, D2, Psi = NULL, cell_num = 500)
sim_read_count <- function(Config, D, Psi = NULL,
means = c(0.002, 0.45), vars = c(100, 1),
wise0 = "element", wise1 = "variant", cell_num = 300,
permute_D = FALSE, sample_cell = TRUE, doublet = 0.0) {
M <- cell_num # number of cells
K <- ncol(Config) # number of clones
N <- nrow(Config) # number of variants
shape1s <- means * vars
shape2s <- (1.0 - means) * vars
D[which(D == 0)] <- NA
if (permute_D == TRUE) {
D <- D[sample(nrow(D)), ]
}
if (sample_cell || M > ncol(D)) {
D_sim <- D[, sample(ncol(D), M, replace = TRUE)]
} else {
D_sim <- D[, seq_len(M)]
}
# genotype for cells H_sim, and clone labels I_sim
if (is.null(Psi)) {
Psi <- rep(1 / K, K)
}
is_doublet <- rep(FALSE, M)
is_doublet[sample(M, round(doublet * M))] <- TRUE
I_sim <- rmultinom(M, 1, prob = Psi) # Clonal label for each cell
I_sim[, is_doublet] <- rmultinom(sum(is_doublet), 2, prob = Psi)
I_sim[which(I_sim > 1)] <- 1
H_sim <- Config %*% I_sim # Genotype for each cell
# generate p, D and A
p_sim <- matrix(0, 2) # p1 and p2 for False positive and True positive
A_sim <- matrix(NA, N, M) # Alteration counts matrix
A_germ_sim <- matrix(NA, N, M) # Alteration counts matrix for germline var
D_germ_sim <- matrix(NA, N, M)
theta0_Bern_sim <- matrix(NA, N, M) # theta0 matrix
theta1_Bern_sim <- matrix(NA, N, M) # theta1 matrix
theta0_binom_sim <- matrix(NA, N, M) # theta0 matrix
theta1_binom_sim <- matrix(NA, N, M) # theta1 matrix
p_sim[1] <- stats::rbeta(1, shape1s[1], shape2s[1])
p_sim[2] <- stats::rbeta(1, shape1s[2], shape2s[2])
for (i in seq_len(N)) {
if (wise0 == "variant") {
p_sim[1] <- stats::rbeta(1, shape1s[1], shape2s[1])
}
if (wise1 == "variant") {
p_sim[2] <- stats::rbeta(1, shape1s[2], shape2s[2])
}
for (j in seq_len(M)) {
if (wise0 == "element") {
p_sim[1] <- stats::rbeta(1, shape1s[1], shape2s[1])
}
if (wise1 == "element") {
p_sim[2] <- stats::rbeta(1, shape1s[2], shape2s[2])
}
if (!is.na(D_sim[i, j])) {
D_germ_sim[i, j] <- D_sim[i, j]
A_germ_sim[i, j] <- rbinom(1, D_germ_sim[i, j], p_sim[2])
theta0_binom_sim[i, j] <- p_sim[1]
theta1_binom_sim[i, j] <- p_sim[2]
theta0_Bern_sim[i, j] <- 1 - dbinom(0,
size = D_sim[i, j],
prob = p_sim[1]
)
theta1_Bern_sim[i, j] <- 1 - dbinom(0,
size = D_sim[i, j],
prob = p_sim[2]
)
if (H_sim[i, j] == 0) {
A_sim[i, j] <- rbinom(1, D_sim[i, j], p_sim[1])
} else {
A_sim[i, j] <- rbinom(1, D_sim[i, j], p_sim[2])
}
}
}
}
I_sim <- t(I_sim)
colnames(I_sim) <- colnames(Config)
row.names(A_sim) <- row.names(D_sim) <- row.names(Config)
row.names(A_germ_sim) <- row.names(D_germ_sim) <- row.names(Config)
cell_names <- paste0("cell", seq_len(ncol(A_sim)))
row.names(I_sim) <- colnames(A_sim) <- colnames(D_sim) <- cell_names
colnames(A_germ_sim) <- colnames(D_germ_sim) <- cell_names
return_list <- list(
"H_sim" = H_sim, "I_sim" = I_sim,
"A_sim" = A_sim, "D_sim" = D_sim,
"A_germ_sim" = A_germ_sim, "D_germ_sim" = D_germ_sim,
"theta0" = theta0_Bern_sim, "theta1" = theta1_Bern_sim,
"theta0_binom" = theta0_binom_sim,
"theta1_binom" = theta1_binom_sim,
"is_doublet" = is_doublet
)
return_list
}
#' Update matrix D with manually selected missing rate
#'
#' Given missing rate, the NA will be generated first. For none NA element,
#' sequencing depth with uniformly sampled from D, row wisely. Namely, the depth
#' is variant specific.
#'
#' @param D A matrix (N variants x M cells), the original sequencing coverage,
#' NA means missing
#' @param n_cells A integer, the number of the cells to generate
#' @param n_sites A integer, the number of variants to generate
#' @param missing_rate A float value, if NULL, use the same missing rate as D
#' @return a n_sites by n_cells matrix sampled from input D.
#'
#' @export
#' @examples
#' data(simulation_input)
#' D1 <- sample_seq_depth(D_input,
#' n_cells = 500, n_sites = 50,
#' missing_rate = 0.85
#' )
sample_seq_depth <- function(D, n_cells = NULL, n_sites = NULL,
missing_rate = NULL) {
D[which(D == 0)] <- NA
if (is.null(n_cells)) {
n_cells <- ncol(D)
}
if (is.null(n_sites)) {
n_sites <- nrow(D)
}
missing_vector <- rowMeans(is.na(D))
if (!is.null(missing_rate)) {
if (missing_rate >= 1.0) {
stop("missing rate should be < 1.0!")
}
m_upbound <- 1 - 1.0 / n_cells
m_mean <- mean(missing_vector)
amplify <- min(
missing_rate / m_mean,
(m_upbound - missing_rate) / (max(missing_vector) - m_mean)
)
missing_vector <- amplify * (missing_vector - m_mean) + missing_rate
}
idx <- sample(nrow(D), size = n_sites, replace = (n_sites > nrow(D)))
D_input <- D[idx, ]
missing_vector <- missing_vector[idx]
D_output <- matrix(NA, nrow = n_sites, ncol = n_cells)
for (i in seq_len(nrow(D_input))) {
D_no_na <- D_input[i, which(!is.na(D_input[i, ]) & D_input[i, ] > 0)]
D_output[i, ] <- sample(D_no_na, n_cells, replace = TRUE)
missing_idx <- sample(n_cells, round(missing_vector[i] * n_cells),
replace = FALSE
)
D_output[i, missing_idx] <- NA
}
D_output
}
#' Down sample number of SNVs in the tree
#'
#' @param tree A tree object from Canopy
#' @param n_SNV A integer, the number of SNVs to keep in the output tree
#' @return a phylo tree with down sampled variants
#'
#' @export
#' @examples
#' data(simulation_input)
#' tree_lite <- sample_tree_SNV(tree_4clone, n_SNV = 10)
sample_tree_SNV <- function(tree, n_SNV = NULL) {
total_SNV <- nrow(tree$Z)
if (is.null(n_SNV)) {
n_SNV <- total_SNV
}
if (total_SNV < n_SNV) {
stop("Only support down sample SNVs")
}
idx <- sample(total_SNV, size = n_SNV)
out.tree <- tree
out.tree$Z <- out.tree$Z[idx, ]
out.tree$CCF <- out.tree$CCF[idx, , drop = FALSE]
out.tree$VAF <- out.tree$VAF[idx, , drop = FALSE]
out.tree$sna <- out.tree$sna[idx, , drop = FALSE]
out.tree
}
#' Reads simulator for donor identification
#' @param GT Variant-by-donor matrix for genotypes
#' @param D_seed Variant-by-cell matrix for read coverage for generating depth,
#' which be row sample and column sample both with replacement
#' @param sample_variants logical(1), if TRUE, sample variants with replacement
#' to the same size, otherwise not
#' @param donor_size Vector of float for the fractions of each donor; default
#' NULL means uniform
#' @param beta_shapes A 3-by-2 matrix of beta parameters for genotypes: 0, 1,
#' and 2; default NULL means matrix(c(0.2, 0.5, 99.8, 99.8, 0.5, 0.2), nrow = 3)
#' @param n_cell An integer for number of total cells
#' @param doublet_rate A float from 0 to 1 for doublet rate; default NULL means
#' rate n_cell / 100000
#'
#' @return A list of various components of the simulated dataset.
#' @export
#'
donor_read_simulator <- function(GT, D_seed, sample_variants = FALSE,
donor_size = NULL, beta_shapes = NULL,
n_cell = 5000, doublet_rate = NULL) {
K <- ncol(GT) # number of clones
N <- nrow(GT) # number of variants
if (is.null(beta_shapes)) {
# beta_shapes <- matrix(c(0.3, 3, 29.7, 29.7, 3, 0.3), nrow = 3)
beta_shapes <- matrix(c(0.2, 0.1, 99.8, 99.8, 0.1, 0.2), nrow = 3)
}
if (is.null(doublet_rate)) {
doublet_rate <- n_cell / 100000
print(paste("doublet rate:", doublet_rate))
}
if (is.null(donor_size)) {
donor_size <- rep(1 / K, K)
}
## generate donor id matrix
n_doublet <- round(doublet_rate * n_cell)
I_sim <- t(rmultinom(n_cell + n_doublet, 1, prob = donor_size))
## generate D
if (sample_variants) {
D_sim <- D_seed[sample(nrow(D_seed), N, replace = TRUE), ]
} else {
D_sim <- D_seed
}
D_sim <- D_sim[, sample(ncol(D_sim), nrow(I_sim), replace = TRUE)]
D_sim <- Matrix::Matrix(D_sim, sparse = TRUE)
## generate A with beta binomial
A_sim <- D_sim
H_sim <- GT %*% t(I_sim)
for (ii in seq_len(3)) {
val2 <- beta_shapes[ii, 1] + beta_shapes[ii, 2]
val1 <- beta_shapes[ii, 1] / val2
idx <- which(as.matrix(D_sim) > 0 & H_sim == (ii - 1))
A_sim[idx] <- VGAM::rbetabinom(length(idx),
size = D_sim[idx],
prob = val1, rho = 1 / (1 + val2)
)
}
## pooling doublet
idx_doublet1 <- sample(n_cell, n_doublet)
idx_doublet2 <- seq(n_cell + 1, n_cell + n_doublet)
A_sim[, idx_doublet1] <- A_sim[, idx_doublet1] + A_sim[, idx_doublet2]
D_sim[, idx_doublet1] <- D_sim[, idx_doublet1] + D_sim[, idx_doublet2]
I_sim[idx_doublet1, ] <- I_sim[idx_doublet1, ] + I_sim[idx_doublet2, ]
A_sim <- A_sim[, seq_len(n_cell)]
D_sim <- D_sim[, seq_len(n_cell)]
I_sim <- I_sim[seq_len(n_cell), ]
## output
colnames(I_sim) <- colnames(GT)
row.names(A_sim) <- row.names(D_sim) <- row.names(GT)
cell_names <- paste0("cell", seq_len(ncol(A_sim)))
row.names(I_sim) <- colnames(A_sim) <- colnames(D_sim) <- cell_names
return_list <- list(
"A" = A_sim, "D" = D_sim,
"I_sim" = I_sim, "GT" = GT
)
return_list
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.