hwep: Hardy-Weinberg Equilibrium in Polyploids

####################################
## Likelihood inference for tetraploids
####################################

#' Gamete frequencies from double reduction and allele frequency for tetraploids
#'
#' @param a The rate of double reduction
#' @param r The major allele frequency
#'
#' @return A vector of length 3. \code{p[[i]]} is the probability a gamete has
#' dosage \code{i-1}
#'
#' @author David Gerard
#'
#' @noRd
gam_from_a <- function(a, r) {
  stopifnot(length(a) == 1, length(r) == 1)
  stopifnot(r >= 0, r <= 1)
  c(p0 = (1 - 2 * r * (1 - a) / (2 + a)) * (1 - r),
    p1 = 4 * (1 - a) * r * (1 - r) / (2 + a),
    p2 = (3 * a / (2 + a) + 2 * r * (1 - a) / (2 + a)) * r)
}

#' Double reduction and allele frequency from gamete frequencies
#'
#' @param p The gamete frequencies.  \code{p[[i]]} is the probability a gamete
#'     has dosage \code{i-1}
#'
#' @return A vector of length 2. The first element is the double reduction
#' rate, the second element is the allele frequency.
#'
#' @author David Gerard
#'
#' @noRd
a_from_gam <- function(p) {
  stopifnot(length(p) == 3)
  stopifnot(abs(sum(p) - 1) < 10^-6)
  c(
    a = ((1 - 2 * p[[2]]) - (p[[3]] - p[[1]]) ^ 2) /
      ((1 + p[[2]]) - (p[[3]] - p[[1]]) ^ 2),
    r = (p[[2]] + 2 * p[[3]]) / 2
   )
}

#' Log-likelihood of tetraploid model under double reduction and allele freq
#'
#' @param a The double reduction parameter.
#' @param r The allele frequency.
#' @inheritParams hwetetra
#'
#' @noRd
lltetra <- function(a, r, nvec) {
  stopifnot(length(a) == 1, length(r) == 1)
  stopifnot(length(nvec) == 5)

  p <- gam_from_a(a = a, r = r)
  q <- stats::convolve(p, rev(p), type = "open")
  stopifnot(q > -10^-6)
  q[q < 0] <- 0

  stats::dmultinom(x = nvec, prob = q, log = TRUE)
}

#' HWE likelihood inference for tetraploids
#'
#' @param nvec A vector containing the observed genotype counts,
#'     where \code{nvec[[i]]} is the number of individuals with genotype
#'     \code{i-1}. This should be of length 5 (since we have tetraploids).
#' @param upperdr The upper bound on the double reduction rate. Defaults
#'     to 1/6, which is the maximum value under the complete equational
#'     segregation model (Mather, 1935).
#' @param addval The penalization applied to each genotype for the random
#'     mating hypothesis. This corresponds to the number of counts each
#'     genotype has \emph{a priori}. Defaults to \code{1 / 100}.
#' @param more A logical. Should we output more (\code{TRUE}) or less
#'     (\code{FALSE})?
#'
#' @return A list some or all of the following elements
#' \describe{
#'  \item{\code{alpha}}{The estimated double reduction rate.}
#'  \item{\code{r}}{The estimated allele frequency.}
#'  \item{\code{chisq_rm}}{The chi-square statistic against the null of random
#'      mating, with an alternative of arbitrary genotype frequencies.}
#'  \item{\code{df_rm}}{The degrees of freedom corresponding to \code{chisq_rm}.}
#'  \item{\code{p_rm}}{The p-value corresponding to \code{chisq_rm}.}
#'  \item{\code{chisq_hwe}}{The chi-square statistic against the null of only
#'      small deviations from HWE, with an alternative of arbitrary
#'      genotype frequencies.}
#'  \item{\code{df_hwe}}{The degrees of freedom corresponding to \code{chisq_hwe}.}
#'  \item{\code{p_hwe}}{The p-value corresponding to \code{chisq_hwe}.}
#'  \item{\code{chisq_ndr}}{The chi-square statistic against the null of no
#'      double reduction at HWE, with an alternative of only small deviations
#'      from HWE.}
#'  \item{\code{df_ndr}}{The degrees of freedom corresponding to \code{chisq_ndr}.}
#'  \item{\code{p_ndr}}{The p-value corresponding to \code{chisq_ndr}.}
#'  \item{\code{q_u}}{The genotype frequencies under the unrestricted model.}
#'  \item{\code{ll_u}}{The log-likelihood under the urestricted model.}
#'  \item{\code{q_rm}}{The genotype frequencies under the random mating model.}
#'  \item{\code{ll_rm}}{The log-likelihood under the random mating model.}
#'  \item{\code{q_hwe}}{The genotype frequencies under the small deviations
#'      from HWE model.}
#'  \item{\code{ll_hwe}}{The log-likelihood under the small deviations from HWE
#'      model.}
#'  \item{\code{q_ndr}}{The genotype frequencies under the no double reduction
#'      at HWE model.}
#'  \item{\code{ll_ndr}}{The log-likelihood under the no double reduction at
#'      HWE model.}
#' }
#' @author David Gerard
#'
#' @references
#' \itemize{
#'   \item{Mather, K. (1935). Reductional and equational separation of the chromosomes in bivalents and multivalents. Journal of Genetics, 30(1), 53-78. \doi{10.1007/BF02982205}}
#' }
#'
#' @examples
#' ## Generate data under HWE
#' set.seed(1)
#' alpha <- 0.1
#' r <- 0.4
#' q <- hwefreq(r = r, alpha = alpha, ploidy = 4)
#' nvec <- c(stats::rmultinom(n = 1, size = 500, prob = q))
#' hwetetra(nvec = nvec)
#'
#' @noRd
hwetetra <- function(nvec, upperdr = 1/6, addval = 1 / 100, more = FALSE) {
  stopifnot(length(nvec) == 5)
  stopifnot(nvec >= 0)
  stopifnot(is.vector(nvec))
  stopifnot(addval >= 0, length(addval) == 1)
  stopifnot(is.logical(more), length(more) == 1)

  ## return early if only one non-zero element
  if (sum(nvec > 0.5) <= 1) {
     retlist <- list(
        alpha = NA_real_,
        r = NA_real_,
        chisq_rm = NA_real_,
        df_rm = NA_real_,
        p_rm = NA_real_,
        chisq_hwe = NA_real_,
        df_hwe = NA_real_,
        p_hwe = NA_real_,
        chisq_ndr = NA_real_,
        df_ndr = NA_real_,
        p_ndr = NA_real_)

      if (more) {
        retlist$q_u <- NA_real_
        retlist$ll_u <- NA_real_
        retlist$q_rm <- NA_real_
        retlist$ll_rm <- NA_real_
        retlist$q_hwe <- NA_real_
        retlist$ll_hwe <- NA_real_
        retlist$q_ndr <- NA_real_
        retlist$ll_ndr <- NA_real_
      }

     return(retlist)
  }

  ## Unconstrained ---------------------------
  q_u <- nvec / sum(nvec)
  ll_u <- stats::dmultinom(x = nvec, prob = q_u, log = TRUE)

  ## Random mating ---------------------------
  p_rm <- rmem(nvec = nvec, addval = addval)
  q_rm <- stats::convolve(p_rm, rev(p_rm), type = "open")
  ll_rm <- stats::dmultinom(x = nvec, prob = q_rm, log = TRUE)

  ## Small deviation from HWE ----------------
  r <- sum(0:4 * nvec) / (sum(nvec) * 4)
  oout <- stats::optim(par = 0.1,
                       fn = lltetra,
                       method = "Brent",
                       lower = 0,
                       upper = upperdr,
                       nvec = nvec,
                       r = r,
                       control = list(fnscale = -1))
  alpha <- oout$par
  p_hwe <- gam_from_a(a = alpha, r = r)
  names(p_hwe) <- NULL
  q_hwe <- stats::convolve(p_hwe, rev(p_hwe), type = "open")
  ll_hwe <- oout$value

  ## No dr ------------------------------------
  p_ndr <- c((1 - r) ^ 2, 2 * r * (1-r), r ^ 2)
  q_ndr <- stats::convolve(p_ndr, rev(p_ndr), type = "open")
  ll_ndr <- stats::dmultinom(x = nvec, prob = q_ndr, log = TRUE)

  ## q names
  names(q_u) <- 0:4
  names(q_rm) <- 0:4
  names(q_hwe) <- 0:4
  names(q_ndr) <- 0:4

  ## Chi square tests -------------------------
  chisq_rm <- -2 * (ll_rm - ll_u)
  pval_rm <- stats::pchisq(q = chisq_rm, df = 2, lower.tail = FALSE)

  chisq_hwe <- -2 * (ll_hwe - ll_u)
  pval_hwe <- stats::pchisq(q = chisq_hwe, df = 2, lower.tail = FALSE)

  chisq_ndr <- -2 * (ll_ndr - ll_hwe)
  pval_ndr <- stats::pchisq(q = chisq_ndr, df = 1, lower.tail = FALSE)

  ## Return everything -----------------------
  retlist <- list(
    alpha = alpha,
    r = r,
    chisq_rm = chisq_rm,
    df_rm = 2,
    p_rm = pval_rm,
    chisq_hwe = chisq_hwe,
    df_hwe = 2,
    p_hwe = pval_hwe,
    chisq_ndr = chisq_ndr,
    df_ndr = 1,
    p_ndr = pval_ndr)

  if (more) {
    retlist$q_u <- q_u
    retlist$ll_u <- ll_u
    retlist$q_rm <- q_rm
    retlist$ll_rm <- ll_rm
    retlist$q_hwe <- q_hwe
    retlist$ll_hwe <- ll_hwe
    retlist$q_ndr <- q_ndr
    retlist$ll_ndr <- ll_ndr
  }

  return(retlist)
}