#' Simulate read count (single SNP model)
#'
#' Given gene configuration, genotype (single SNP), genetic effect size (log aFC),
#' and other parameters, simulate read count.
#'
#' @section About sampling Y^h:
#' To specify read count of each haplotype (Y^h) given library size and relative abundance.
#' It is set in y_dist.
#' Currently it supports three distribution types: 'poisson', 'lognormal', and 'negbinom'.
#' Specifically, y_dist is a list with distribution name in 'type'.
#' For y_dist$type = 'poisson', no other parameter is required.
#' For y_dist$type = 'lognormal', 'sigma' needs to specify, and
#' Y^h = round(library_size * relative_abundance * rlnorm(1, 0, y_dist$sigma))
#' For y_dist$type = 'negbinom', 'prob' and 'size_factor' need to specify, and
#' Y^h = rnbinom(1, size = y_dist$size_factor * library_size * relative_abundance, prob = y_dist$prob)
#'
#' @param gene gene instance generated from \code{\link{create_gene}}
#' @param genotype genotype generated from \code{\link{create_genotype}} (the 'genotype' object in the list)
#' @param beta log(aFC)
#' @param L_read length of read
#' @param y_dist distribution of Y^h | library size, relative abundance.
#' It specifies the shape of the distribution whereas
#' the mean is given by library size and relative abundance.
#'
#' @return read counts, where observed count include y1, y2 (AS count of each haplotypes) and
#' ystar (total count - y1 - y2) along with library size Ti_lib.
#' Also, it includes unobserved count as 'hidden' (y1star and y2star).
#'
#' @examples
#' gene = create_gene()
#' genotype = create_genotype(
#' maf = c(0.05, 0.3),
#' nsample = 300,
#' nreplicate = 1000
#' )
#' simulate_read_count(
#' gene = gene,
#' genotype = genotype[[1]]$genotype,
#' beta = 0.2,
#' L_read = 75,
#' y_dist = list(
#' type = 'negbinom',
#' prob = 2/3,
#' size_factor = 2
#' )
#' )
#'
#' @export
#' @importFrom stats rbeta rlnorm rnbinom rpois
simulate_read_count = function(gene, genotype, beta, L_read, y_dist) {
## gene: generated by generate_gene_instance
## beta: log(fold change)
## L_read: length of read
## genotype: N x 2 genotype data.frame (N: sample size)
N = nrow(genotype)
observed = list()
hidden = list()
ase_bysite = list()
for(i in 1 : N) {
if(gene$theta$type == 'beta') {
theta_i = rbeta(1, gene$theta$alpha, gene$theta$beta)
} else if(gene$theta$type == 'lognormal') {
theta_i = rlnorm(1, gene$theta$k, sqrt(gene$theta$sigma))
}
if(gene$library_dist$type == 'poisson') {
T_lib = rpois(1, gene$library_dist$lambda)
} else if(gene$library_dist$type == 'negbinom'){
T_lib = rnbinom(1, prob = gene$library_dist$prob, size = gene$library_dist$size)
}
Gi = as.numeric(genotype[i, ])
theta_prime1 = exp(beta * Gi[1]) * theta_i
theta_prime2 = exp(beta * Gi[2]) * theta_i
if(y_dist$type == 'poisson') {
Ti1 = rpois(1, T_lib * theta_prime1)
Ti2 = rpois(1, T_lib * theta_prime2)
} else if(y_dist$type == 'lognormal') {
Ti1 = round(T_lib * theta_prime1 * rlnorm(1, 0, y_dist$sigma))
Ti2 = round(T_lib * theta_prime2 * rlnorm(1, 0, y_dist$sigma))
} else if(y_dist$type == 'negbinom') {
Ti1 = rnbinom(1, size = y_dist$size_factor * T_lib * theta_prime1, prob = eval(parse(text = y_dist$prob)))
Ti2 = rnbinom(1, size = y_dist$size_factor * T_lib * theta_prime2, prob = eval(parse(text = y_dist$prob)))
}
P1 = sample(1 : gene$L_gene, Ti1, replace = T, prob = gene$vec_pos)
P2 = sample(1 : gene$L_gene, Ti2, replace = T, prob = gene$vec_pos)
Zij = rbinom(length(gene$fj), 1, 2 * gene$fj * (1 - gene$fj))
temp = read2data(P1, P2, gene$S, Zij, L_read)
observed_i = temp$observed
observed_i$Gi1 = Gi[1]
observed_i$Gi2 = Gi[2]
observed_i$Ti_lib = T_lib
observed_i$Y1 = Ti1
observed_i$Y2 = Ti2
observed[[length(observed) + 1]] = observed_i
hidden[[length(hidden) + 1]] = temp$hidden
ase_bysite[[length(ase_bysite) + 1]] = temp$ase_bysite
}
observed = do.call(rbind, observed)
hidden = do.call(rbind, hidden)
return(list(observed = observed, hidden = hidden, ase_bysite = ase_bysite))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.