orient-bias-filter: Orientation Bias Filter

# OxoG artifact: G->T on read1 and C->A on read2
# FFPE artifact: C->T on read1 and T->A on read2

#' Simulate orientation bias read statistics.
#'
#' @param n   number of reads
#' @param theta  variant frequency
#' @param phi    damage frequency
#' @param e.mean       mean probability of read error
#' @param e.sd         standard deviation of the probability of read error
#' @export
simulate_orient_bias_read_stats <- function(n, theta, phi, e.mean, e.sd) {
	# 0: reference
	# 1: alternative
	# 2: third nucleotide
	# 3: fourth nucleotide
	nucleotides <- 0:3;
	ref <- 0;
	alt <- 1;

	es <- anti_phred(rnorm(n, mean=e.mean, sd=e.sd));

	probs <- c((1 - theta), theta);
	# original bases
	bases0 <- sample(nucleotides[1:2], n, replace=TRUE, prob=probs);

	# observed bases
	bases <- bases0;

	# whether read-pair orientation is consistent with artifact mechanism
	oriented <- sample(c(FALSE, TRUE), n, replace=TRUE);

	damages <- rep(FALSE, n);
	idx <- (bases == ref) & oriented;
	m <- sum(idx);
	damages[idx] <- sample(c(FALSE, TRUE), m, replace=TRUE, prob=c(1 - phi, phi));
	bases[damages] <- alt;

	# whether read-pair orientation and mutation is consistent with 
	# artifact mechanism
	#orients <- (sample(c(FALSE, TRUE), n, replace=TRUE) & bases == alt) | damages;

	d <- data.frame(
		base = bases,
		e = es,
		oriented = oriented,
		base0 = bases0,
		damage = damages
	);

	d
}

#' Simulate orientation bias read statistics based on a hierarchical model.
#'
#' Simulates data for multiple loci as determined by the size of `ns`,
#' and each locus is indexed by `j`.
#'
#' @param ns  number of reads at each locus
#' @param alpha_theta  hyperparmaeter for theta_j (variant frequency)
#' @param beta_theta   hyperparameter for theta_j (variant frequency)
#' @param alpha_phi    hyperparameter for phi_j (damage frequency)
#' @param beta_phi     hyperparameter for phi_j (damage frequency)
#' @param e.mean       mean probability of read error
#' @param e.sd         standard deviation of the probability of read error
#' @export
simulate_orient_bias_read_stats_hier <- function(ns, alpha_theta, beta_theta, alpha_phi, beta_phi, e.mean, e.sd) {
	m <- length(ns);

	thetas <- rbeta(m, alpha_theta, beta_theta);
	phis <- rbeta(m, alpha_phi, beta_phi);

	ys <- mapply(
		function(j, n, theta, phi) {
			s <- simulate_orient_bias_read_stats(n, theta, phi, e.mean, e.sd);
			d <- data.frame(
				locus = j,
				s,
				theta = theta,
				phi = phi
			);

			d
		},
		1:m, ns, thetas, phis,
		SIMPLIFY = FALSE
	);

	do.call(rbind, ys)
}


#' Simulate read counts for quantification model for orientation bias.
#' 
#' @param ns     number of reads at each locus
#' @param theta  variant frequency
#' @param phi    damage frequency
#' @export
simulate_orient_bias_read_counts <- function(ns, theta, phi) {
	# number of loci
	m <- length(ns);

	ni <- floor(ns / 2);
	nc <- ns - ni;

	xi <- rbinom(m, ni, theta);
	xr <- rbinom(m, nc, theta);
	xd <- rbinom(m, nc - xr, phi);

	xc <- xr + xd;

	data.frame(xc, xi, nc, ni, xr, xd)
}

#' Simulate read counts for quantification model based on a hierarchical
#' model for orientation bias.
#'
#' @param ns     number of reads each locus
#' @param alpha_theta  hyperparameter for theta (variant frequency)
#' @param beta_theta   hyperparameter for theta (variant frequency)
#' @param alpha_phi    hyperparameter for phi (damage frequency)
#' @param beta_phi     hyperparameter for phi (damage frequency)
#' @export
simulate_orient_bias_read_counts_hier <- function(ns, alpha_theta, beta_theta, alpha_phi, beta_phi) {
	# number of loci
	m <- length(ns);

	thetas <- rbeta(m, alpha_theta, beta_theta);
	phis <- rbeta(m, alpha_phi, beta_phi);

	d <- simulate_orient_bias_read_counts(ns, thetas, phis);
	d$theta <- thetas;
	d$phi <- phis;

	d
}