#' Simulate sequencing data obtained from the alphabetr approach with a specified clonal structure
#'
#' \code{create_data()} simulates an alphabetr sequencing experiment by sampling
#' clones from a clonal structure specified by the user. The clones are
#' placed on a frequency distribution where a fixed number of clones represents
#' the top proportion of the population in frequency and the other clones
#' represent the rest of the population in frequency. Different error models
#' and different sampling strategies can be simulated as well.
#'
#' @param TCR The specified clonal structure, which can be created from
#'    \code{\link{create_clones}}.
#' @param plates The number of plates of data.
#' @param error_drop A vector of length 2 with the mean of the drop error rate
#'    and the sd of the drop error rate.
#' @param error_seq A vector of length 2 with the mean of the in-frame error
#'    rate and the sd of the in-frame error rate.
#' @param error_mode A vector of two strings determining the "mode" of the error
#'    models. The first element sets the mode of the drop errors, and the second
#'    element sets the mode of the in-frame errors. The two modes available are
#'    "constant" for a constant error rate and "lognormal" for error rates
#'    drawn from a lognormal distribution. If the mode is set to "constant" the
#'    sd specified in \code{error_drop} and/or \code{error_seq} will be ignored.
#' @param skewed The number of clones that represent the top proportion of the
#'    population by frequency (which is specified by \code{prop_top}).
#' @param prop_top The proportion of the population in frequency represented by
#'    the number of clones specified by \code{skewed}.
#' @param dist The distribution of frequency of the top clones. Currently only
#'    "linear" is available.
#' @param numb_cells A two column matrix determining the sampling strategy of
#'    the experiment. The first column represents the number of cells per well,
#'    and the second column represents the number of wells with that sample
#'    size. The sum of column 2 should equal 96 times the number of plates.
#'
#' @return A list of length 2. The first element is a matrix representing the
#'    data of the alpha chains, and the second element is a matrix representing
#'    the data of beta chains. The matrix represents the sequencing data by
#'    representing the wells of the data by rows and the chain indices by
#'    column. Entry [i, j] of the matrix represents if chain j is found in
#'    well i (yes == 1, no == 0). e.g. if alpha chain 25 is found in well 10,
#'    then [10, 25] of the alpha matrix will be 1.
#'
#' @examples
#'  # see the help for create_clones() for details of this function call
#'  clones <- create_clones(numb_beta = 1000,
#'                          dual_alpha = .3,
#'                          dual_beta = .06,
#'                          alpha_sharing = c(0.80, 0.15, 0.05),
#'                          beta_sharing  = c(0.75, 0.20, 0.05))
#'
#'  # creating a data set with 5 plates, lognormal error rates, 10 clones
#'  # making up the top 60% of the population in frequency, and a constant
#'  # sampling strategy of 50 cells per well for 480 wells (five 96-well plates)
#'  dat <- create_data(clones$TCR, plates = 5,
#'                     error_drop = c(.15, .01),
#'                     error_seq = c(.05, .001),
#'                     error_mode = c("lognormal", "lognormal"),
#'                     skewed = 10,
#'                     prop_top = 0.6,
#'                     dist = "linear",
#'                     numb_cells = matrix(c(50, 480), ncol = 2))
#'
#' @export
create_data <- function(TCR, plates,
                        error_drop,
                        error_seq,
                        error_mode,
                        skewed,
                        prop_top,
                        dist = "linear",
                        numb_cells) {
  wells <- 96 * plates          # total number of wells (96-well plates)

  # collecting some parameters of the clonal structure
  numb_clones <- nrow(TCR)      # number of clones
  numb_alph <- max(TCR[, 3:4])  # number of unique alpha chains in the population
  numb_beta <- max(TCR[, 1:2])  # number of unique beta chains in the population

  #------------- clone frequency distribution ----------------#
  # The first `skewed` clones comprise `prop_top` of the population in
  # frequency; the remaining clones uniformly share the rest. Sampling is done
  # by drawing clone indices from this distribution and taking the
  # corresponding rows of TCR; the input clones should already be in random
  # order.
  if (dist == "linear") {
    # Top-clone frequencies decrease linearly: p_k = p0 + k * step for
    # k = 0, ..., skewed - 1 (step is negative). p0 and step are found by
    # solving a 2x2 linear system requiring that
    #   (1) the last (smallest) top frequency equals 1.1x the tail frequency
    #   (2) the top frequencies sum to prop_top
    last_term <- (1 - prop_top) / (numb_clones - skewed) * 1.1
    lhs11 <- 1
    lhs12 <- skewed - 1
    lhs21 <- (1 + skewed - 1)
    lhs22 <- (skewed - 1) / 2 + (skewed - 1) ^ 2 / 2
    A <- matrix(c(lhs11, lhs21, lhs12, lhs22), nrow = 2)
    b <- matrix(c(last_term, prop_top), nrow = 2)
    solveAb <- solve(A, b)
    prob1 <- solveAb[1] + 0:(skewed - 1) * solveAb[2]
    prob2 <- rep((1 - prop_top) / (numb_clones - skewed), numb_clones - skewed)
    dist_vector <- c(prob1, prob2)
  } else {
    # fail fast instead of erroring later with "object 'dist_vector' not found"
    stop("Invalid dist specification. Currently only 'linear' is available.")
  }

  #------------- error model ----------------#
  # Columns attached to TCR (the clones matrix) for the errors in the exp:
  # col 5 is the drop rate (i.e. the rate at which the chain won't show up at all)
  # The in-frame (substitution) error rates and the erroneous sequences
  # associated with each chain are kept in separate per-chain vectors/lists.

  # drop rates: can be a constant drop rate or drop rates on a lognormal dist
  if (error_mode[1] == "constant") {
    err_drop <- error_drop[1]
    TCR_drop <- cbind(TCR, err_drop)
  } else if (error_mode[1] == "lognormal") {
    if (length(error_drop) != 2) stop("The error_drop argument must be length 2.")
    # input mean and sd of the drop error rates
    err_mean <- error_drop[1]
    err_sd <- error_drop[2]
    if (err_mean == 0) {
      # log(0) is -Inf, so a zero mean is handled explicitly: it means no drop
      # errors at all. (meanlog = 0, sdlog = 0 would make rlnorm() return
      # exactly 1, i.e. a 100% drop rate, and the resampling loop below would
      # never terminate.)
      drop_vec <- rep(0, numb_clones)
    } else {
      # convert the desired mean/sd into the meanlog/sdlog parameters of the
      # associated normal distribution; see vignette
      mu <- log(err_mean) - log((err_sd / err_mean) ^ 2 + 1) / 2
      sd <- sqrt(log((err_sd / err_mean) ^ 2 + 1))
      # sample drop rates; resample until none are greater than 90%
      drop_vec <- stats::rlnorm(numb_clones, meanlog = mu, sdlog = sd)
      while (any(drop_vec > .9)) {
        drop_vec <- stats::rlnorm(numb_clones, meanlog = mu, sdlog = sd)
      }
    }
    TCR_drop <- cbind(TCR, drop_vec)
  } else {
    stop("Invalid error model specification. Choose either 'constant' or 'lognormal'")
  }

  # in-frame error rates: can be constant or occur on a distribution
  if (error_mode[2] == "constant") {
    # when constant, all chains have the same rate of being sequenced as one
    # of two distinct erroneous sequences
    err_seq_alph <- rep(error_seq[1], numb_alph)
    err_seq_beta <- rep(error_seq[1], numb_beta)
    err_num_alph <- rep(2, numb_alph)
    err_num_beta <- rep(2, numb_beta)
    if (error_seq[1] != 0) {
      # total number of false in-frame sequences
      numb_false_alph <- sum(err_num_alph)
      numb_false_beta <- sum(err_num_beta)
      # false_alph[[i]] / false_beta[[i]] record the erroneous chain indices
      # associated with TRUE chain i (one list entry per true chain; the
      # erroneous indices start after the true chain indices)
      false_alph <- vector(mode = "list", length = numb_alph)
      false_beta <- vector(mode = "list", length = numb_beta)
      for (i in seq_len(numb_alph)) {
        false_alph[[i]] <- numb_alph + (2 * i - 1):(2 * i)
      }
      for (i in seq_len(numb_beta)) {
        false_beta[[i]] <- numb_beta + (2 * i - 1):(2 * i)
      }
    } else {
      # when the error rate is 0, there are no false in-frame chains
      numb_false_alph <- 0
      numb_false_beta <- 0
      false_alph <- list()
      false_beta <- list()
    }
  } else if (error_mode[2] == "lognormal") {
    # input mean and sd of the in-frame error rates
    err_mean <- error_seq[1]
    err_sd <- error_seq[2]
    if (err_mean != 0) {
      # parameters of the associated normal distribution; see vignette
      mu <- log(err_mean) - log((err_sd / err_mean) ^ 2 + 1) / 2
      sd <- sqrt(log((err_sd / err_mean) ^ 2 + 1))
      # sample per-chain error rates from the lognormal distribution; each
      # chain can be erroneously sequenced into 3 distinct wrong chains
      err_seq_alph <- stats::rlnorm(numb_alph, meanlog = mu, sdlog = sd)
      err_seq_beta <- stats::rlnorm(numb_beta, meanlog = mu, sdlog = sd)
      err_num_alph <- rep(3, numb_alph)
      err_num_beta <- rep(3, numb_beta)
      # total number of false chains
      numb_false_alph <- sum(err_num_alph)
      numb_false_beta <- sum(err_num_beta)
      # record which erroneous chains are associated with which true chains
      false_alph <- vector(mode = "list", length = numb_alph)
      false_beta <- vector(mode = "list", length = numb_beta)
      ind <- 1
      for (i in seq_len(numb_alph)) {
        false_alph[[i]] <- numb_alph + ind:(ind + err_num_alph[i] - 1)
        ind <- ind + err_num_alph[i]
      }
      ind <- 1
      for (i in seq_len(numb_beta)) {
        false_beta[[i]] <- numb_beta + ind:(ind + err_num_beta[i] - 1)
        ind <- ind + err_num_beta[i]
      }
    } else { # when err_mean == 0
      # no false sequences, and none associated with any chain
      numb_false_alph <- 0
      numb_false_beta <- 0
      err_seq_alph <- rep(0, numb_alph)
      err_seq_beta <- rep(0, numb_beta)
      false_alph <- vector(mode = "list", length = numb_alph)
      false_beta <- vector(mode = "list", length = numb_beta)
    }
  } else {
    stop("Invalid error model specification. Choose either 'constant' or 'lognormal'")
  }

  #-----------------------------------#
  #   Creation of the fake data set   #
  #-----------------------------------#
  # Entry (i, j) = 1 if chain j is present in well i, 0 otherwise
  data_alph <- matrix(0, nrow = wells, ncol = numb_alph + numb_false_alph)
  data_beta <- matrix(0, nrow = wells, ncol = numb_beta + numb_false_beta)
  distinct_ss <- numb_cells[, 1]  # vector of the distinct sample sizes
  ind_well <- 1                   # index of the well we are sampling cells into
  for (sampl in seq_along(distinct_ss)) {
    sample_size <- distinct_ss[sampl]   # number of cells in these wells
    numb_wells <- numb_cells[sampl, 2]  # number of wells with sample_size cells
    for (n_well in seq_len(numb_wells)) {
      # sample cells (clone indices) into the well from the frequency dist;
      # drop = FALSE keeps a 1-row matrix when sample_size == 1
      rand <- sample(numb_clones, size = sample_size, prob = dist_vector,
                     replace = TRUE)
      samp_clones <- TCR_drop[rand, , drop = FALSE]

      # expand clones into beta chains (dual-beta clones contribute 2 chains);
      # col 1 = chain index, col 2 = the clone's drop rate
      samp_beta <- matrix(nrow = 2 * nrow(samp_clones), ncol = 2)
      dual_beta <- which(samp_clones[, 1] != samp_clones[, 2])
      ind <- 1
      for (i in seq_len(nrow(samp_clones))) {
        if (i %in% dual_beta) {
          samp_beta[ind:(ind + 1), 1] <- samp_clones[i, 1:2]
          samp_beta[ind:(ind + 1), 2] <- samp_clones[i, 5]
          ind <- ind + 2
        } else {
          samp_beta[ind, 1] <- samp_clones[i, 1]
          samp_beta[ind, 2] <- samp_clones[i, 5]
          ind <- ind + 1
        }
      }
      # remove the unused preallocated rows; drop = FALSE keeps the matrix
      # shape even when a single row remains
      samp_beta <- samp_beta[!is.na(samp_beta[, 1]), , drop = FALSE]
      # col 2 of samp_beta has the drop rate; beta_i isn't dropped when
      # stats::runif() > rate
      samp_beta <- samp_beta[stats::runif(nrow(samp_beta)) > samp_beta[, 2], 1]

      # same expansion for the alpha chains
      samp_alph <- matrix(nrow = 2 * nrow(samp_clones), ncol = 2)
      dual_alph <- which(samp_clones[, 3] != samp_clones[, 4])
      ind <- 1
      for (i in seq_len(nrow(samp_clones))) {
        if (i %in% dual_alph) {
          samp_alph[ind:(ind + 1), 1] <- samp_clones[i, 3:4]
          samp_alph[ind:(ind + 1), 2] <- samp_clones[i, 5]
          ind <- ind + 2
        } else {
          samp_alph[ind, 1] <- samp_clones[i, 3]
          samp_alph[ind, 2] <- samp_clones[i, 5]
          ind <- ind + 1
        }
      }
      samp_alph <- samp_alph[!is.na(samp_alph[, 1]), , drop = FALSE]
      samp_alph <- samp_alph[stats::runif(nrow(samp_alph)) > samp_alph[, 2], 1]

      # in-frame errors: each surviving chain is switched to one of its
      # associated erroneous sequences with its per-chain error rate
      switch_alph <- which(stats::runif(length(samp_alph)) < err_seq_alph[samp_alph])
      for (a in switch_alph) {
        ind_alph <- samp_alph[a]
        samp_alph[a] <- sample(false_alph[[ind_alph]], size = 1)
      }
      switch_beta <- which(stats::runif(length(samp_beta)) < err_seq_beta[samp_beta])
      for (b in switch_beta) {
        ind_beta <- samp_beta[b]
        samp_beta[b] <- sample(false_beta[[ind_beta]], size = 1)
      }

      # record the chains observed in this well
      data_alph[ind_well, samp_alph] <- 1
      data_beta[ind_well, samp_beta] <- 1
      ind_well <- ind_well + 1
    } # end for - wells
  } # end for - sampl
  #-----------------------------------#
  # END Creation of the fake data set #
  #-----------------------------------#
  list(alpha = data_alph, beta = data_beta)
}
# end main function
# NOTE(review): removed stray website-embed footer text ("Add the following
# code to your website..." / "Embedding Snippets") that was accidentally
# pasted from a documentation page and is not valid R code.