orient-bias-filter: Orientation Bias Filter

p_ref_base_given_params <- function(theta, phi, e) {
	     theta  * (e / 3) * (1 - phi) +
	(1 - theta) * (1 - e) * (1 - phi)	+
	     theta  * (1 - e) *      phi
}

p_alt_base_given_params <- function(theta, phi, e) {
	     theta  * (1 - e) * (1 - phi) +
	(1 - theta) * (e / 3) * (1 - phi)	+
	(1 - theta) * (1 - e) *      phi
}

p_other_base_given_params <- function(theta, phi, e) {
	# alt or ref mutated to other allele by sequencing error
	(e / 3) * (1 - phi) +
	# alt or ref mutated to other allele by damage
	(1 - e) *      phi
}

lp_ref_base_given_params <- function(theta, phi, e) {
	log_sum_exp(c(
		log(    theta) + log(e)-log(3) + log(1 - phi),
		log(1 - theta) + log(1 - e)    + log(1 - phi),
	  log(    theta) + log(1 - e)    + log(    phi)
	))
}

lp_alt_base_given_params <- function(theta, phi, e) {
	log_sum_exp(c(
		log(    theta) + log(1 - e)    + log(1 - phi),
		log(1 - theta) + log(e)-log(3) + log(1 - phi),
	  log(1 - theta) + log(1 - e)    + log(    phi)
	))
}

lp_other_base_given_params <- function(theta, phi, e) {
	log_sum_exp(c(
		# alt or ref mutated to other allele by sequencing error
		log(e)-log(3) + log(1 - phi),
		# alt or ref mutated to other allele by damage
		log(1 - e)    + log(    phi)
	))
}

lp_bases_given_params <- function(d, theta, phi) {
	sum(mapply(
		function(base, e, orient) {
			# read-pair orientation is consistent with artifact
			if (orient) {
				phi2 <- phi;
			} else {
				phi2 <- 0;
			}

			if (base == 0) {
				lp_ref_base_given_params(theta, phi2, e)
			} else if (base == 1) {
				lp_alt_base_given_params(theta, phi2, e)
			} else {
				lp_other_base_given_params(theta, phi2, e)
			}
		},
		d$base, d$e, d$oriented
	))
}

# compare deviance of model1: theta = theta.mle and model0: theta = 0
deviance_theta <- function(d, theta, phi) {
	2 * (
		# alternative model: theta = theta.mle
		lp_bases_given_params(d, theta, phi) - 
		# null model: theta = 0
		lp_bases_given_params(d, 0, phi)
	)
}

estimate_theta <- function(d, theta.0, phi, abstol=1e-6) {
	theta.opt <- optim(
		logit(theta.0),
		function(theta.real) {
			-lp_bases_given_params(d, logistic(theta.real), phi)
		},
		method = "Brent",
		lower = -15,
		upper = 15,
		control = list(abstol=abstol)
	);
	theta.hat <- logistic(theta.opt$par);

#	theta.opt <- optim(
#		theta.0,
#		function(theta.real) {
#			-lp_bases_given_params(d, logistic(theta.real), phi)
#		},
#		#method = "BFGS"
#		#method = "Nelder-Mead"
#		method = "L-BFGS-B"
#	);
#	theta.hat <- logistic(theta.opt$par);
#
#	theta.opt <- optimize(
#		function(theta.real) {
#			lp_bases_given_params(d, logistic(theta.real), phi)
#		},
#		interval = c(-100, 100),
#		maximum = TRUE
#	);
#	theta.hat <- logistic(theta.opt$maximum);
#
#	theta.opt <- optimize(
#		function(theta) {
#			lp_bases_given_params(d, theta, phi)
#		},
#		interval = c(0, 1),
#		maximum = TRUE
#	);
#	theta.hat <- theta.opt$maximum

	theta.hat
}

estimate_phi <- function(d, theta, phi.0, abstol=1e-6) {
	phi.opt <- optim(
		logit(phi.0),
		function(phi.real) {
			-lp_bases_given_params(d, theta, logistic(phi.real))
		},
		method = "Brent",
		lower = -15,
		upper = 15,
		control = list(abstol=abstol)
	);
	
	logistic(phi.opt$par)
}

estimate_theta_phi <- function(d, theta, phi, eps=1e-6, max.iter=20) {
	theta.old <- theta;
	phi.old <- phi;
	delta <- 1;
	converged <- FALSE;

	for (i in 1:max.iter) {
		theta <- estimate_theta(d, theta, phi);
		phi <- estimate_phi(d, theta, phi);
		delta <- (abs(theta.old - theta) + abs(phi.old - phi)) / 2;
		if (delta < eps) {
			converged <- TRUE;
			break;
		}
		theta.old <- theta;
		phi.old <- phi;
	}

	list(theta = theta, phi = phi)
}

estimate_initial_theta <- function(d) {
	counts <- table(d$base);
	if (length(counts) >= 2) {
		x <- counts[2];
	} else {
		x <- 0;
	}

	(x + 1) / (sum(counts) + 2)
}

#' Variant test with adjustment for orientation bias.
#'
#' Test whether variant allele frequency > 0,
#' with consideration for orientation bias, as specified
#' by the estimated orientation bias error rate.
#' 
#' @param d  \code{data.frame} with fields:
#'           base, e (esimated base error), and oriented
#'           (whether read-pair orientation is consistent with artifact mechanism)
#' @param phi.0  initial estimate of damage rate (causing orientation bias)
#' @return 
orient_bias_adjusted_variant_test <- function(data, phi.0, ...) {
	theta.0 <- estimate_initial_theta(data);

	mle <- estimate_theta_phi(data, theta.0, phi.0, ...);
	theta.hat <- mle$theta;
	phi.hat <- mle$phi;	

	dev <- deviance_theta(data, mle$theta, mle$phi);
	params <- c(df = 1);
	p <- pchisq(dev, df = params[1], lower.tail=FALSE);

	structure(
		list(
			method = "Orientation bias adjusted variant test with unknown extent of damage",
			statistic = c(D=dev), parameter = params, p.value = p, estimate = unlist(mle)
		),
		class = "htest"
	)
}

#' Variant test with adjustment for orientation bias.
#'
#' Test whether variant allele frequency > 0,
#' with consideration for orientation bias, as specified
#' by the estimated orientation bias error rate.
#' 
#' @param d  \code{data.frame} with fields:
#'           base, e (esimated base error), and oriented
#'           (whether read-pair orientation is consistent with artifact mechanism)
#' @param phi  estimated damage rate (causing orientation bias)
#' @return 
orient_bias_adjusted_variant_test_with_known_phi <- function(data, phi) {
	theta.0 <- estimate_initial_theta(data);

	theta.hat <- estimate_theta(data, theta.0, phi);
	dev <- deviance_theta(data, theta.hat, phi);
	params <- c(df = 1, phi = phi);
	p <- pchisq(dev, df = params[1], lower.tail=FALSE);

	structure(
		list(
			method = "Orientation bias adjusted variant test with known extent of damage",
			statistic = c(D=dev), parameter = params, p.value = p, estimate = theta.hat
		),
		class = "htest"
	)
}