mixqtl: Combine total and allele-specific counts for cis-QTL analysis

Documented in simulate_read_count

#' Simulate read count (single SNP model)
#'
#' Given gene configuration, genotype (single SNP), genetic effect size (log aFC),
#' and other parameters, simulate read count.
#'
#' @section About sampling Y^h:
#' To specify read count of each haplotype (Y^h) given library size and relative abundance.
#' It is set in y_dist.
#' Currently it supports three distribution types: 'poisson', 'lognormal', and 'negbinom'.
#' Specifically, y_dist is a list with distribution name in 'type'.
#' For y_dist$type = 'poisson', no other parameter is required.
#' For y_dist$type = 'lognormal', 'sigma' needs to specify, and
#' Y^h = round(library_size * relative_abundance * rlnorm(1, 0, y_dist$sigma))
#' For y_dist$type = 'negbinom', 'prob' and 'size_factor' need to specify, and
#' Y^h = rnbinom(1, size = y_dist$size_factor * library_size * relative_abundance, prob = y_dist$prob)
#'
#' @param gene gene instance generated from \code{\link{create_gene}}
#' @param genotype genotype generated from \code{\link{create_genotype}} (the 'genotype' object in the list)
#' @param beta log(aFC)
#' @param L_read length of read
#' @param y_dist distribution of Y^h | library size, relative abundance.
#'   It specifies the shape of the distribution whereas
#'   the mean is given by library size and relative abundance.
#'
#' @return read counts, where observed count include y1, y2 (AS count of each haplotypes) and
#' ystar (total count - y1 - y2) along with library size Ti_lib.
#' Also, it includes unobserved count as 'hidden' (y1star and y2star).
#'
#' @examples
#' gene = create_gene()
#' genotype = create_genotype(
#'   maf = c(0.05, 0.3),
#'   nsample = 300,
#'   nreplicate = 1000
#' )
#' simulate_read_count(
#'   gene = gene,
#'   genotype = genotype[[1]]$genotype,
#'   beta = 0.2,
#'   L_read = 75,
#'   y_dist = list(
#'     type = 'negbinom',
#'     prob = 2/3,
#'     size_factor = 2
#'   )
#' )
#'
#' @export
#' @importFrom stats rbeta rlnorm rnbinom rpois
simulate_read_count = function(gene, genotype, beta, L_read, y_dist) {
  ## gene: generated by generate_gene_instance
  ## beta: log(fold change)
  ## L_read: length of read
  ## genotype: N x 2 genotype data.frame (N: sample size)

  N = nrow(genotype)
  observed = list()
  hidden = list()
  ase_bysite = list()
  for(i in 1 : N) {
    if(gene$theta$type == 'beta') {
      theta_i = rbeta(1, gene$theta$alpha, gene$theta$beta)
    } else if(gene$theta$type == 'lognormal') {
      theta_i = rlnorm(1, gene$theta$k, sqrt(gene$theta$sigma))
    }
    if(gene$library_dist$type == 'poisson') {
      T_lib = rpois(1, gene$library_dist$lambda)
    } else if(gene$library_dist$type == 'negbinom'){
      T_lib = rnbinom(1, prob = gene$library_dist$prob, size = gene$library_dist$size)
    }

    Gi = as.numeric(genotype[i, ])
    theta_prime1 = exp(beta * Gi[1]) * theta_i
    theta_prime2 = exp(beta * Gi[2]) * theta_i
    if(y_dist$type == 'poisson') {
      Ti1 = rpois(1, T_lib * theta_prime1)
      Ti2 = rpois(1, T_lib * theta_prime2)
    } else if(y_dist$type == 'lognormal') {
      Ti1 = round(T_lib * theta_prime1 * rlnorm(1, 0, y_dist$sigma))
      Ti2 = round(T_lib * theta_prime2 * rlnorm(1, 0, y_dist$sigma))
    } else if(y_dist$type == 'negbinom') {
      Ti1 = rnbinom(1, size = y_dist$size_factor * T_lib * theta_prime1, prob = eval(parse(text = y_dist$prob)))
      Ti2 = rnbinom(1, size = y_dist$size_factor * T_lib * theta_prime2, prob = eval(parse(text = y_dist$prob)))
    }

    P1 = sample(1 : gene$L_gene, Ti1, replace = T, prob = gene$vec_pos)
    P2 = sample(1 : gene$L_gene, Ti2, replace = T, prob = gene$vec_pos)
    Zij = rbinom(length(gene$fj), 1, 2 * gene$fj * (1 - gene$fj))
    temp = read2data(P1, P2, gene$S, Zij, L_read)
    observed_i = temp$observed
    observed_i$Gi1 = Gi[1]
    observed_i$Gi2 = Gi[2]
    observed_i$Ti_lib = T_lib

    observed_i$Y1 = Ti1
    observed_i$Y2 = Ti2
    observed[[length(observed) + 1]] = observed_i
    hidden[[length(hidden) + 1]] = temp$hidden
    ase_bysite[[length(ase_bysite) + 1]] = temp$ase_bysite
  }
  observed = do.call(rbind, observed)
  hidden = do.call(rbind, hidden)
  return(list(observed = observed, hidden = hidden, ase_bysite = ase_bysite))
}