R/higher_freq.R

Defines functions outprob seg_multi ll_ao gam_to_df seg_lrt ll_gl ll_g gam_to_par par_to_gf par_to_gam gf_freq gamfreq gamfreq_seg gamfreq_pp n_pp_mix pp_seg_pats beta_bounds convolve_2 n_dr_params

Documented in beta_bounds gamfreq gf_freq n_pp_mix seg_lrt seg_multi

## Tolerance global variables
pkg_env <- new.env()
pkg_env$TOL_small <- sqrt(.Machine$double.eps) ## for everything but df calculations.
pkg_env$TOL_big <- .Machine$double.eps^(1/4) ## only used for df calculations

logit <- function (x) {
    log(x) - log(1 - x)
}

expit <- function (x) {
    1/(1 + exp(-x))
}

#' Returns all k-tuples that sum to n
#'
#' @param n The sum of the tuple
#' @param k The tuple number
#'
#' @return A matrix, where the row is a k-tuple that sums to n. This
#'    returns all possible such k-tuples
#'
#' @author David Gerard
#'
#' @noRd
all_multinom <- function (n, k) {
  if (k == 0) {
    return(NULL)
  }
  else if (k == 1) {
    return(n)
  }
  mat_list <- list()
  for (i in 0:n) {
    mat_list[[i + 1]] <- cbind(i, all_multinom(n - i, k - 1))
  }
  mat <- do.call(what = rbind, args = mat_list)
  colnames(mat) <- NULL
  mat
}

#' Number of double reduction mixing components.
#'
#' If alpha_i is the probability that there are i copies of identical by
#' double reduction alleles in a gamete, then this will return the max
#' i. So there are really this plus 1 alphas (if you include alpha_0).
#'
#' @param ploidy Ploidy
#'
#' @author David Gerard
#'
#' @noRd
n_dr_params <- function(ploidy) {
  floor(ploidy / 4)
}

#' Conventional convolution (with type = open) for probability distributions.
#'
#' @param p1 Discrete distribution 1
#' @param p2 Discrete distribution 2
#' @param nudge If new distribution is less than nudge, returns nudge.
#'
#' @author David Gerard
#'
#' @noRd
convolve_2 <- function(p1, p2, nudge = sqrt(.Machine$double.eps)) {
  q <- stats::convolve(x = p1, y = rev(p2), type = "open")
  q[q < nudge] <- nudge
  q <- q / sum(q)
  return(q)
}

#' Upper bounds on double reduction rates.
#'
#' Provides the upper bounds on the double reduction rates based on the
#' formulas in Huang et al. (2019). There are two upper bounds provided.
#' The upper bound from complete equational separation is higher than
#' the upper bound from the pure random chromatid segregation.
#'
#' @param ploidy The ploidy
#' @param model Either complete equational segregation (\code{"ces"}) (Mather, 1935)
#'    or pure random chromatid segregation \code{"prcs"})  (Haldane, 1930). See also
#'    Huang et al. (2019).
#'
#' @return A vector of length \code{floor(ploidy / 4)} containing the upper bounds
#'    on the rates of double reduction. The \code{i}the element is the
#'    upper bound on the probability that there are \code{i} pairs of
#'    identical by double reduction alleles in a gamete.
#'
#' @references
#' \itemize{
#'   \item{Haldane, J. B. S. (1930). Theoretical genetics of autopolyploids. \emph{Journal of genetics}, 22, 359-372. \doi{10.1007/BF02984197}}
#'   \item{Huang, K., Wang, T., Dunn, D. W., Zhang, P., Cao, X., Liu, R., & Li, B. (2019). Genotypic frequencies at equilibrium for polysomic inheritance under double-reduction. \emph{G3: Genes, Genomes, Genetics}, 9(5), 1693-1706. \doi{10.1534/g3.119.400132}}
#'   \item{Mather, K. (1935). Reductional and equational separation of the chromosomes in bivalents and multivalents. \emph{Journal of genetics}, 30, 53-78. \doi{10.1007/BF02982205}}
#' }
#'
#' @examples
#' drbounds(4)
#' drbounds(6)
#' drbounds(8)
#' drbounds(10)
#'
#' @author David Gerard
#'
#' @export
drbounds <- function (ploidy, model = c("ces", "prcs")) {
  model <- match.arg(model)
  stopifnot(ploidy > 2)
  stopifnot(ploidy%%2 == 0)
  ibdr <- floor(ploidy/4)
  if (model == "ces") {
    alpha <- rep(NA_real_, length = ibdr)
    for (i in seq_along(alpha)) {
        jvec <- i:ibdr
        alpha[[i]] <- exp(
          log_sum_exp(
            (ploidy/2 - 3 * jvec) * log(2) + lchoose(jvec, i) +
              lchoose(ploidy/2, jvec) +
              lchoose(ploidy/2 - jvec, ploidy/2 - 2 * jvec) -
              lchoose(ploidy, ploidy/2)
            )
          )
    }
  } else if (model == "prcs") {
    ivec <- seq_len(ibdr)
    alpha <- exp(
      lchoose(ploidy, ploidy / 2 - ivec) + lchoose(ploidy / 2 - ivec,  ivec) +
        (ploidy / 2 -  2 * ivec) * log(2) - lchoose(2 * ploidy, ploidy / 2)
    )
  }
  return(alpha)
}

#' Bounds on the distortion at simplex loci caused by double reduction.
#'
#' The frequency of (nullplex, simplex, duplex) gametes is
#' (.5 + beta, .5 - 2 * beta, beta). This function returns the upper
#' bound on beta under two models.
#'
#' Returns the upper bound on the probability of a gamete with a genotype of
#' 2 when the parent has a genotype of 1. This is based on two models.
#' The upper bound from complete equational separation is higher than
#' the upper bound from the pure random chromatid segregation. See
#' Huang et al (2019) for a description of these models.
#'
#' @return The upper bound on beta.
#'
#' @inheritParams drbounds
#'
#' @author David Gerard
#'
#' @examples
#' beta_bounds(4)
#' beta_bounds(6)
#' beta_bounds(8)
#' beta_bounds(10)
#'
#' @references
#' \itemize{
#'   \item{Huang, K., Wang, T., Dunn, D. W., Zhang, P., Cao, X., Liu, R., & Li, B. (2019). Genotypic frequencies at equilibrium for polysomic inheritance under double-reduction. \emph{G3: Genes, Genomes, Genetics}, 9(5), 1693-1706. \doi{10.1534/g3.119.400132}}
#' }
#'
#' @export
beta_bounds <- function(ploidy, model = c("ces", "prcs")) {
  model <- match.arg(model)
  dvec <- drbounds(ploidy = ploidy, model = model)
  return(sum(seq_along(dvec) * dvec) / ploidy)
}

#' Simplex to real function
#'
#' @param q A point on the simplex
#'
#' @return A point on the reals, of length one less than q.
#'
#' @references
#' \itemize {
#'  \item{Betancourt, M. J. (2010). Cruising the simplex: Hamiltonian Monte Carlo and the Dirichlet distribution. \emph{arXiv}. \doi{10.48550/arXiv.1010.3436}}
#'  \item{\url{https://mc-stan.org/docs/2_19/reference-manual/simplex-transform-section.html}}
#' }
#'
#' @author David Gerard
#'
#' @examples
#' simplex_to_real(c(0.5, 0.3, 0.2))
#' real_to_simplex(simplex_to_real(c(0.5, 0.3, 0.2)))
#'
#' @noRd
simplex_to_real <- function (q) {
    K <- length(q)
    stopifnot(K > 1)
    z <- q/(1 - c(0, cumsum(q)[-K]))
    y <- logit(z[-K]) - log(1/(K - 1:(K - 1)))
    return(y)
}

#' Real to simplex function
#'
#' @param y A point on the reals
#'
#' @return A point on the simplex, of length one greater than y.
#'
#' @references
#' \itemize {
#'  \item{Betancourt, M. J. (2010). Cruising the simplex: Hamiltonian Monte Carlo and the Dirichlet distribution. \emph{arXiv}. \doi{10.48550/arXiv.1010.3436}}
#'  \item{\url{https://mc-stan.org/docs/2_19/reference-manual/simplex-transform-section.html}}
#' }
#'
#' @author David Gerard
#'
#' @examples
#' real_to_simplex(c(1, 3))
#' simplex_to_real(real_to_simplex(c(1, 3)))
#'
#' @noRd
real_to_simplex <- function (y) {
    stopifnot(length(y) > 0)
    K <- length(y) + 1
    z <- c(expit(y + log(1/(K - 1:(K - 1)))), 1)
    x <- rep(NA_real_, length.out = K)
    cumval <- 0
    for (i in seq_len(K)) {
        x[[i]] <- (1 - cumval) * z[[i]]
        cumval <- cumval + x[[i]]
    }
    return(x)
}

#' Gamete frequencies from meiosis under just double reduction
#'
#' Returns the probability of a gamete dosage given the parent dosage
#' (\code{g}), the parent ploidy (\code{ploidy}), and the double reduction
#' parameter(s) (\code{alpha}). This is for biallelic loci.
#'
#' @param alpha A numeric vector containing the double reduction parameter(s).
#'     This should be a vector of length \code{floor(ploidy/4)} where \code{alpha[i]}
#'     is the probability of exactly \code{i} pairs of IBDR alleles
#'     being in the gamete. Note that \code{sum(alpha)} should be less than
#'     1, as \code{1 - sum(alpha)} is the probability of no double reduction.
#' @param g The dosage of the parent. Should be an integer between \code{0}
#'     and \code{ploidy}.
#' @param ploidy The ploidy of the species. This should be an even positive
#'     integer.
#' @param log_p A logical. Should we return the log-probability (\code{TRUE})
#'     or not (\code{FALSE})? Defaults to \code{FALSE}.
#'
#' @return A vector of length \code{ploidy/2 + 1}, containing the (log)
#'     probabilities of a gamete carrying a dosage of 0,1,...,ploidy/2+1 from a
#'     parent of dosage \code{G} who has ploidy \code{ploidy} and a
#'     double reduction rate \code{alpha}.
#'
#' @author David Gerard
#'
#' @references
#' \itemize{
#' \item{Fisher, R. A., & Mather, K. (1943). The inheritance of style length in Lythrum salicaria. Annals of Eugenics, 12(1), 1-23. \doi{10.1111/j.1469-1809.1943.tb02307.x}}
#' \item{Huang, K., Wang, T., Dunn, D. W., Zhang, P., Cao, X., Liu, R., & Li, B. (2019). Genotypic frequencies at equilibrium for polysomic inheritance under double-reduction. G3: Genes, Genomes, Genetics, 9(5), 1693-1706. \doi{10.1534/g3.119.400132}}
#' }
#'
#' @noRd
#'
#' @examples
#' gamfreq_dr(alpha = 1/6, g = 2, ploidy = 4)
gamfreq_dr <- function (alpha, g, ploidy, log_p = FALSE) {
    x <- 0:(ploidy/2)
    stopifnot(length(ploidy) == 1L, length(g) == 1L, length(log_p) == 1L)
    stopifnot(is.logical(log_p))
    stopifnot(ploidy%%2 == 0)
    stopifnot(0 <= g, g <= ploidy)
    stopifnot(0 <= x, x <= ploidy/2)
    ibdr <- floor(ploidy/4)
    stopifnot(length(alpha) == ibdr)
    if (ploidy == 2) {
        retvec <- stats::dhyper(
          x = x,
          m = g,
          n = ploidy - g,
          k = ploidy / 2,
          log = log_p)
        return(retvec)
    }
    if (all(abs(alpha) < pkg_env$TOL_small)) {
        retvec <- stats::dhyper(
          x = x,
          m = g,
          n = ploidy - g,
          k = ploidy / 2,
          log = log_p)
        return(retvec)
    }
    ijmat <- cbind(utils::combn(x = 0:ibdr, m = 2), matrix(rep(0:ibdr, each = 2), nrow = 2))
    jvec <- ijmat[1, ]
    ivec <- ijmat[2, ]
    alpha <- c(1 - sum(alpha), alpha)
    alphavec <- alpha[ivec + 1]
    retvec <- rep(NA_real_, length = length(x))
    for (k in seq_along(x)) {
        retvec[[k]] <- sum(exp(lchoose(g, jvec) + lchoose(g -
            jvec, x[[k]] - 2 * jvec) + lchoose(ploidy - g, ivec -
            jvec) + lchoose(ploidy - g - (ivec - jvec), ploidy/2 -
            x[[k]] - 2 * (ivec - jvec)) - lchoose(ploidy, ivec) -
            lchoose(ploidy - ivec, ploidy/2 - 2 * ivec)) * alphavec)
    }
    retvec[is.nan(retvec)] <- -Inf
    if (log_p) {
        retvec <- log(retvec)
    }
    return(retvec)
}

#' Genotype frequencies of an F1 population under just double reduction
#'
#' Returns the distribution of an offspring dosages given
#' parental dosages (\code{g1} and \code{g2}), the ploidy of the
#' species (\code{ploidy}), and the double reduction parameter(s)
#' (\code{alpha} and \code{alpha2}).
#'
#' @inheritParams gamfreq_dr
#' @param alpha2 The double reduction rate(s) for the second parent. By default,
#'     this is the same as the first parent.
#' @param g1 The dosage of parent 1. Should be an integer between \code{0}
#'     and \code{ploidy}.
#' @param g2 The dosage of parent 2. Should be an integer between \code{0}
#'     and \code{ploidy}.
#'
#' @return A vector of probabilities. The \code{i}th element is the
#'     probability that the offspring will have dosage \code{i-1}.
#'
#' @author David Gerard
#'
#' @noRd
#'
#' @examples
#' f1_gf_dr(alpha = c(0.5, 0.1), g1 = 4, g2 = 5, ploidy = 8)
#'
f1_gf_dr <- function (alpha, alpha2 = alpha, g1, g2, ploidy) {
    stopifnot(length(g1) == 1L, length(g2) == 1L, length(ploidy) ==
        1L)
    stopifnot(ploidy%%2 == 0)
    stopifnot(0 <= g1, g1 <= ploidy)
    stopifnot(0 <= g2, g2 <= ploidy)
    ibdr <- floor(ploidy / 4)
    stopifnot(length(alpha) == ibdr)
    stopifnot(length(alpha2) == ibdr)
    p1gamprob <- gamfreq_dr(
      alpha = alpha,
      g = g1,
      ploidy = ploidy,
      log_p = FALSE)
    p2gamprob <- gamfreq_dr(
      alpha = alpha2,
      g = g2,
      ploidy = ploidy,
      log_p = FALSE)
    zygdist <- convolve_2(p1 = p1gamprob, p2 = p2gamprob)
    return(zygdist)
}


#' Returns all gamete segregation patterns under preferential pairing
#'
#' @param ploidy The ploidy
#'
#' @return A data frame with all of the gamete frequencies possible. Columsn include
#'   \describe{
#'     \item{g}{parent genotype.}
#'     \item{m}{pairing configuration.}
#'     \item{p}{gamete frequencies.}
#'     \item{ploidy}{The ploidy}
#'   }
#'
#' @author David Gerard
#'
#' @noRd
pp_seg_pats <- function(ploidy) {
  stopifnot(ploidy%%2 == 0, ploidy >= 2)
  mmat <- all_multinom(n = ploidy / 2, k = 3)
  df <- data.frame(
    ploidy = ploidy,
    g = mmat[, 2] + 2 * mmat[, 3],
    m = I(lapply(seq_len(nrow(mmat)), function(i) mmat[i,])))
  df <- df[order(df$g), ]
  df$p <- vector(mode = "list", length = nrow(df))
  rownames(df) <- NULL
  for (i in seq_len(nrow(df))) {
    df$p[[i]] <- stats::dbinom(x = 0:(ploidy/2) - df$m[[i]][[3]], size = df$m[[i]][[2]], prob = 0.5)
  }
  return(df)
}

#' Number of mixture components
#'
#' The number of disomic inheritance patterns for a given ploidy and a given
#' parental dosage. See also \code{\link{seg}} for the list of all possible
#' disomic inheritance patterns for even ploidies up to 20.
#'
#' @param g parent genotype
#' @param ploidy parent ploidy
#'
#' @return The number of mixture components.
#'
#' @examples
#' n_pp_mix(g = 0, ploidy = 4)
#' n_pp_mix(g = 1, ploidy = 4)
#' n_pp_mix(g = 2, ploidy = 4)
#' n_pp_mix(g = 3, ploidy = 4)
#' n_pp_mix(g = 4, ploidy = 4)
#'
#' n_pp_mix(g = 0, ploidy = 6)
#' n_pp_mix(g = 1, ploidy = 6)
#' n_pp_mix(g = 2, ploidy = 6)
#' n_pp_mix(g = 3, ploidy = 6)
#' n_pp_mix(g = 4, ploidy = 6)
#' n_pp_mix(g = 5, ploidy = 6)
#' n_pp_mix(g = 6, ploidy = 6)
#'
#' @export
n_pp_mix <- function(g, ploidy) {
  ifelse(g >= ploidy / 2, ploidy / 2 - ceiling(g / 2) + 1, floor(g / 2) + 1)
}

#' Gamete frequencies from mixture of pairing configurations.
#'
#' @param gamma The mixing probabilities for the pairing configurations.
#' @param g The parent gentoype.
#' @param ploidy The ploidy of the parent.
#'
#' @author David Gerard
#'
#' @return A vector of gamete frequencies.
#'
#' @noRd
gamfreq_pp <- function(gamma, g, ploidy) {
  stopifnot(
    length(gamma) == n_pp_mix(g = g, ploidy = ploidy),
    abs(sum(gamma) - 1) < pkg_env$TOL_big,
    gamma >= 0)
  plist <- segtest::seg[segtest::seg$ploidy == ploidy & (segtest::seg$mode == "disomic" | segtest::seg$mode == "both") & segtest::seg$g == g, ]$p
  pvec <- rep(0, length.out = ploidy / 2 + 1)
  for (i in seq_along(plist)) {
    pvec <- pvec + plist[[i]] * gamma[[i]]
  }
  return(pvec)
}

#' Gamete frequencies from a mixture of pairing configurations and polysomic
#'
#' This is the same as \code{\link{gamfreq_pp}()} but adds a mixture
#' component of polysomic inheritance at the maximum double reduction rate.
#' The last mixture component is the one for polysomic inheritance, with
#' the effects of double reduction.
#'
#' @inheritParams gamfreq_pp
#' @param alpha The double reduction rate of the polysomic component.
#'
#' @author David Gerard
#'
#' @examples
#' gamfreq_seg(gamma = c(0.1, 0.3, 0.6), g = 2, ploidy = 6, alpha = 0.3)
#'
#'
#' @noRd
gamfreq_seg <- function(gamma, g, ploidy, alpha) {
  stopifnot(
    length(gamma) == n_pp_mix(g = g, ploidy = ploidy) + 1,
    abs(sum(gamma) - 1) < pkg_env$TOL_big,
    gamma >= 0)
  stopifnot(alpha >= 0, length(alpha) == floor(ploidy / 4))
  plist <- segtest::seg[segtest::seg$ploidy == ploidy & (segtest::seg$mode == "disomic" | segtest::seg$mode == "both") & segtest::seg$g == g, ]$p
  pvec <- rep(0, length.out = ploidy / 2 + 1)
  for (i in seq_along(plist)) {
    pvec <- pvec + plist[[i]] * gamma[[i]]
  }
  pvec <- pvec + gamfreq_dr(alpha = alpha, g = g, ploidy = ploidy, log_p = FALSE) * gamma[[length(gamma)]]
  return(pvec)
}

#' Gamete frequencies under a generalized model
#'
#' Returns the gamete frequencies for autopolyploids, allopolyploids,
#' and segmental allopolyploids, accounting for the effects of double
#' reduction and partial preferential pairing.
#'
#' @section Models:
#'
#' If \code{type = "polysomic"}, then the gamete frequencies correspond
#' to those of Huang et al (2019). Those formulas are for general multiallelic
#' loci, so see also Appendix G of Gerard (2022) for special case of
#' biallelic loci. The relevant parameter is \code{alpha}, a vector of
#' length \code{floor(ploidy / 4)}, where \code{alpha[[i]]} is the
#' probability that there are \code{i} pairs of double reduced alleles
#' in a gamete. The theoretical upper bound on \code{alpha} is given in
#' \code{\link{drbounds}()}.
#'
#' If \code{type = "mix"} and \code{add_dr = FALSE}, then the gamete
#' frequencies correspond to the pairing configuration model of
#' Gerard et al (2018). This model states that the gamete frequencies are
#' a convex combination of the disomic inheritance frequencies. The weights
#' of this convex combination are provided in the \code{gamma} parameter. The
#' total number of disomic segregation patterns is given by
#' \code{\link{n_pp_mix}()}. The order of these segregation patterns used is
#' the order in \code{\link{seg}}.
#'
#' The model for \code{type = "mix"} and \code{add_dr = TRUE} is the same
#' as for \code{type = "mix"} and \code{add_dr = FALSE} \emph{except} at
#' parental simplex loci. At such loci, there are no effects of preferential
#' pairing, and so the option \code{add_dr = TRUE} allows for the effects
#' of double reduction at simplex loci. The relevant parameter here is
#' \code{beta}. The first three gamete frequencies at simplex loci are
#' \code{c(0.5 + beta, 0.5 - 2 * beta, beta)}, and the rest are 0. The
#' upper bound on beta for two different models are given by
#' \code{\link{beta_bounds}()}.
#'
#' @param g Parent genotype.
#' @param ploidy Parent ploidy. Should be even, and between 2 and 20 (inclusive).
#'    Let me know if you need the ploidy to be higher. I can update
#'    the package really easily.
#' @param gamma The mixture proportions for the pairing configurations.
#'    The proportions are in the same order the configurations in
#'    \code{\link{seg}}. See Gerard et al (2018) for details on
#'    pairing configurations.
#' @param alpha The double reduction rate(s) (if using). Defaults to 0's.
#' @param beta The double reduction adjustment for simplex markers if
#'    \code{type = "mix"} and \code{add_dr = TRUE}. Assumed to be 0 by default.
#' @param type Either \code{"mix"}, meaning a mixture model of
#'    pairing configurations, or \code{"polysomic"} for polysomic inheritance.
#' @param add_dr A logical. If \code{type = "polysomic"}, then the double
#'    double reduction rate (\code{"alpha"}) will be used no matter the
#'    value of \code{add_dr}, so set \code{alpha = 0} if you don't want it.
#'    But if \code{type = "mix"} then we will incorporate double reduction
#'    only in simplex markers (where it matters the most, and where
#'    preferential pairing does not operate).
#'
#' @return The vector of gamete frequencies. Element \code{i} is the
#'     probability a gamete has genotype \code{i - 1}.
#'
#' @references
#' \itemize{
#'   \item{Gerard, D. (2023). Double reduction estimation and equilibrium tests in natural autopolyploid populations. \emph{Biometrics}, 79(3), 2143-2156. \doi{10.1111/biom.13722}}
#'   \item{Gerard, D., Ferrão, L. F. V., Garcia, A. A. F., & Stephens, M. (2018). Genotyping polyploids from messy sequencing data. \emph{Genetics}, 210(3), 789-807. \doi{10.1534/genetics.118.301468}}
#'   \item{Huang, K., Wang, T., Dunn, D. W., Zhang, P., Cao, X., Liu, R., & Li, B. (2019). Genotypic frequencies at equilibrium for polysomic inheritance under double-reduction. \emph{G3: Genes, Genomes, Genetics}, 9(5), 1693-1706. \doi{10.1534/g3.119.400132}}
#' }
#'
#' @author David Gerard
#'
#' @export
#'
#' @examples
#' ## Various duplex models
#' gamfreq(g = 2, ploidy = 4, gamma = c(0, 1), type = "mix")
#' gamfreq(g = 2, ploidy = 4, gamma = c(1, 0), type = "mix")
#' gamfreq(g = 2, ploidy = 4, gamma = c(0.5, 0.5), type = "mix")
#' gamfreq(g = 2, ploidy = 4, alpha = 0, type = "polysomic")
#' gamfreq(g = 2, ploidy = 4, alpha = 1/6, type = "polysomic")
#'
#' ## Various simplex models
#' gamfreq(g = 1, ploidy = 4, beta = 1/24, gamma = 1, type = "mix", add_dr = TRUE)
#' gamfreq(g = 1, ploidy = 4, alpha = 1/6, type = "polysomic")
#' gamfreq(g = 1, ploidy = 4, gamma = 1, type = "mix", add_dr = FALSE)
#' gamfreq(g = 1, ploidy = 4, alpha = 0, type = "polysomic")
gamfreq <- function(
    g,
    ploidy,
    gamma = NULL,
    alpha = NULL,
    beta = NULL,
    type = c("mix", "polysomic"),
    add_dr = TRUE) {

  if (g == 0) {
    pvec <- c(1, rep(x = 0, times = ploidy / 2))
    return(pvec)
  } else if (g == ploidy) {
    pvec <- c(rep(x = 0, times = ploidy / 2), 1)
    return(pvec)
  }

  ## Check input
  stopifnot(ploidy >= 2, ploidy %% 2 == 0, ploidy <= 20)
  stopifnot(g >= 0, g <= ploidy)
  if (!is.null(gamma)) {
    stopifnot(
      length(gamma) == n_pp_mix(g = g, ploidy = ploidy),
      abs(sum(gamma) - 1) < pkg_env$TOL_big,
      gamma >= 0
    )
  }
  if (!is.null(alpha)) {
    stopifnot(
      alpha >= 0,
      sum(alpha) <= 1,
      length(alpha) == floor(ploidy / 4)
    )
  }
  if (!is.null(beta)) {
    stopifnot(
      beta >= 0,
      beta <= 0.25,
      length(beta) == 1
    )
  }
  stopifnot(is.logical(add_dr), length(add_dr) == 1)
  type <- match.arg(type)
  if (type == "mix") {
    stopifnot(!is.null(gamma))
    if (add_dr) {
      if (is.null(beta)) {
        beta <- 0
      }
    }
  } else if (type == "polysomic") {
    if (is.null(alpha)) {
      alpha <- rep(0, times = floor(ploidy / 4))
    }
  }

  ## calculate gamete frequencies
  if (type == "polysomic") {
    pvec <- gamfreq_dr(alpha = alpha, g = g, ploidy = ploidy, log_p = FALSE)
  } else if (type == "mix" && add_dr && (g == 1 | g == ploidy - 1)) {
    if (g == 1) {
      pvec <- c(
        0.5 + beta,
        0.5 - 2 * beta,
        beta,
        rep(0, times = ploidy / 2 - 2)
      )
    } else if (g == ploidy - 1) {
      pvec <- c(
        rep(0, times = ploidy / 2 - 2),
        beta,
        0.5 - 2 * beta,
        0.5 + beta
      )
    }
  } else if (type == "mix") {
    pvec <- gamfreq_pp(gamma = gamma, g = g, ploidy = ploidy)
  }

  return(pvec)
}

#' Genotype frequencies of an F1 population under a generalized model.
#'
#' @param p1_g,p1_ploidy,p1_gamma,p1_alpha,p1_beta,p1_type,p1_add_dr The first parent's version of the parameters in \code{\link{gamfreq}()}.
#' @param p2_g,p2_ploidy,p2_gamma,p2_alpha,p2_beta,p2_type,p2_add_dr The second parent's version of the parameters in \code{\link{gamfreq}()}.
#' @param pi The proportion of outliers.
#' @param nudge Zeros go to nudge
#'
#' @author David Gerard
#'
#' @seealso \code{\link{gamfreq}()}.
#'
#' @return A vector of genotype frequencies. Element \code{i} is the probability
#'     and offspring has genotype \code{i - 1}.
#'
#' @export
#'
#' @examples
#' q <- gf_freq(
#'   p1_g = 2,
#'   p1_ploidy = 4,
#'   p1_gamma = c(0.1, 0.9),
#'   p1_type = "mix",
#'   p2_g = 2,
#'   p2_ploidy = 6,
#'   p2_alpha = 0.1,
#'   p2_type = "polysomic",
#'   pi = 0.05)
#'
gf_freq <- function(
    p1_g,
    p1_ploidy,
    p1_gamma = NULL,
    p1_alpha = NULL,
    p1_beta = NULL,
    p1_type = c("mix", "polysomic"),
    p1_add_dr = TRUE,
    p2_g,
    p2_ploidy,
    p2_gamma = NULL,
    p2_alpha = NULL,
    p2_beta = NULL,
    p2_type = c("mix", "polysomic"),
    p2_add_dr = TRUE,
    pi = 0,
    nudge = sqrt(.Machine$double.eps)) {
  TOL <- pkg_env$TOL_small
  p1_type <- match.arg(p1_type)
  p2_type <- match.arg(p2_type)
  p1 <- gamfreq(
    g = p1_g,
    ploidy = p1_ploidy,
    gamma = p1_gamma,
    alpha = p1_alpha,
    beta = p1_beta,
    type = p1_type,
    add_dr = p1_add_dr)
  p2 <- gamfreq(
    g = p2_g,
    ploidy = p2_ploidy,
    gamma = p2_gamma,
    alpha = p2_alpha,
    beta = p2_beta,
    type = p2_type,
    add_dr = p2_add_dr)
  q <- convolve_2(p1 = p1, p2 = p2, nudge = nudge)
  q <- q * (1 - pi) + pi / length(q)
  return(q)
}

#' Convert parameterization from something optim() can use to something
#' gamfreq() can use.
#'
#' Note that if `rule` has a a parameter (`alpha`, `beta`, or `gamma`), then
#' that value is assumed for `gam`, not a value from `par`. It should be
#' `NULL` in `rule` to take it from `par`.
#'
#' @param rule A list of length three. The first element gives the model
#'     of parent 1. The second element gives the model of parent 2.
#'     The third element is a logiical on whether we add an outlier or now.
#'     The first two lists have elements
#'     \describe {
#'       \item{ploidy}{The parent's ploidy.}
#'       \item{g}{The parent dosage.}
#'       \item{type}{Either "mix", "mix_dr", or "polysomic".}
#'       \item{alpha}{The fixed value of alpha (not from par}
#'       \item{beta}{The fixed value of beta (not from par)}
#'       \item{gamma}{The fixed value of gamma (not from par)}
#'     }
#'     The third list has the following elements
#'     \describe{
#'       \item{outlier}{A logical on whether to include and outlier.}
#'       \item{pi}{The fixed value of pi (not from par).}
#'     }
#' @param par A vector of parameters to be converted based on rule.
#'    \itemize{
#'    \item{
#'      If \code{type = "mix"}, then
#'      gamma is assumed to be the real-line parameterization (see
#'      \code{\link{real_to_simplex}()} and \code{\link{simplex_to_real}()})
#'      }
#'    \item{
#'      If \code{type = "mix_dr"}, then this is the same as "mix" but for simplex
#'      markers we allow for beta.
#'      Beta (if g = 1) is not real-line transformed since there are bounds on it. See
#'      \code{\link{beta_bounds}()}.
#'      }
#'    \item{
#'       If \code{type = "polysomic"}, then only alpha is specified. This is
#'       not real-line specified, since there are natural bounds on alpha.
#'       See \code{\link{drbounds}()}.
#'      }
#'    \item{
#'       All of parent 1's parameters are in the vector, then all of parent 2's
#'       parameters.
#'       }
#'    \item{
#'       If \code{outlier = TRUE}, then the outlier proportion is the last
#'       element of \code{par}
#'       }
#'    }
#' @param gamma_type If \code{gamma_type = "real"}, then it assumes that the
#'    gamma values are transformed on the real line. Otherwise, it assumes
#'    the gamma values are on the simplex.
#'
#' @return A list of length 3. The first contains the parameters from
#'     gamfreq() for parent 1, the second contains the parameters from
#'     gamfreq of parent 2. The third contains the outlier proportion.
#'     These parameters in the parent lists are \code{ploidy}, \code{g}, \code{gamma},
#'     \code{beta}, \code{alpha}, \code{type}, and \code{add_dr}.
#'     See \code{\link{gamfreq}()} for details on these parameters. The third
#'     is a list with elements \code{outlier} (a logical on whether
#'     outliers are modeled) and \code{pi} (the outlier proportion).
#'
#' @author David Gerard
#'
#' @examples
#' rule <- list(
#'   list(ploidy = 4, g = 2, type = "mix"),
#'   list(ploidy = 8, g = 4, type = "polysomic"),
#'   list(outlier = TRUE)
#'   )
#' par <- c(-2, 0.1, 0.2, 0.03)
#' par_to_gam(par = par, rule = rule)
#'
#' rule <- list(
#'   list(ploidy = 4, g = 0, type = "mix"),
#'   list(ploidy = 8, g = 1, type = "mix"),
#'   list(outlier = TRUE)
#'   )
#' par <- c(0.1)
#' par_to_gam(par = par, rule = rule)
#'
#' @noRd
par_to_gam <- function(par, rule, gamma_type = c("real", "simplex")) {
  gamma_type <- match.arg(gamma_type)
  gam <- list()
  gam[[1]] <- list(
    ploidy = rule[[1]]$ploidy,
    g = rule[[1]]$g,
    alpha = rule[[1]]$alpha,
    beta = rule[[1]]$beta,
    gamma = rule[[1]]$gamma,
    type = NULL,
    add_dr = NULL)
  gam[[2]] <- list(
    ploidy = rule[[2]]$ploidy,
    g = rule[[2]]$g,
    alpha = rule[[2]]$alpha,
    beta = rule[[2]]$beta,
    gamma = rule[[2]]$gamma,
    type = NULL,
    add_dr = NULL)
  gam[[3]] <- list(
    outlier = rule[[3]]$outlier,
    pi = rule[[3]]$pi
  )

  ## Do parent 1 ----
  cindex <- 1 ## keeps track of where we are in par
  if (rule[[1]]$type == "polysomic") {
    gam[[1]]$type <- "polysomic"
    gam[[1]]$add_dr <- FALSE
    if (rule[[1]]$g != 0 && rule[[1]]$g != rule[[1]]$ploidy) {
      if (is.null(rule[[1]]$alpha)) {
        ndr <- floor(rule[[1]]$ploidy / 4)
        gam[[1]]$alpha <- par[cindex:(cindex + ndr - 1)]
        cindex <- cindex + ndr
      } else {
        gam[[1]]$alpha <- rule[[1]]$alpha
      }
    }
  } else if (rule[[1]]$type == "mix_dr" && (rule[[1]]$g == 1 || rule[[1]]$g == rule[[1]]$ploidy - 1)) {
    gam[[1]]$type <- "mix"
    gam[[1]]$add_dr <- TRUE
    gam[[1]]$gamma <- 1
    if (is.null(rule[[1]]$beta)) {
      gam[[1]]$beta <- par[[cindex]]
      cindex <- cindex + 1
    } else {
      gam[[1]]$beta <- rule[[1]]$beta
    }
  } else if (rule[[1]]$type == "mix" && (rule[[1]]$g == 1 || rule[[1]]$g == rule[[1]]$ploidy - 1)) {
    gam[[1]]$type <- "mix"
    gam[[1]]$gamma <- 1
    gam[[1]]$add_dr <- FALSE
  } else if ((rule[[1]]$type == "mix" || rule[[1]]$type == "mix_dr")) {
    gam[[1]]$type <- "mix"
    gam[[1]]$add_dr <- FALSE
    if (rule[[1]]$g != 0 && rule[[1]]$g != rule[[1]]$ploidy) {
      if (is.null(rule[[1]]$gamma)) {
        npp <- n_pp_mix(g = rule[[1]]$g, ploidy = rule[[1]]$ploidy)
        if (gamma_type == "real") {
          gam[[1]]$gamma <- real_to_simplex(par[cindex:(cindex + npp - 2)])
          cindex <- cindex + npp - 1
        } else if (gamma_type == "simplex") {
          gam[[1]]$gamma <- par[cindex:(cindex + npp - 1)]
          cindex <- cindex + npp
        }
      } else {
        gam[[1]]$gamma <- rule[[1]]$gamma
      }
    }
  } else {
    stop("par_to_gam 1: how did you get here?")
  }

  ## Do parent 2 ----
  if (rule[[2]]$type == "polysomic") {
    gam[[2]]$type <- "polysomic"
    gam[[2]]$add_dr <- FALSE
    if (rule[[2]]$g != 0 && rule[[2]]$g != rule[[2]]$ploidy) {
      if (is.null(rule[[2]]$alpha)) {
        ndr <- floor(rule[[2]]$ploidy / 4)
        gam[[2]]$alpha <- par[cindex:(cindex + ndr - 1)]
        cindex <- cindex + ndr
      } else {
        gam[[2]]$alpha <- rule[[2]]$alpha
      }
    }
  } else if (rule[[2]]$type == "mix_dr" && (rule[[2]]$g == 1 || rule[[2]]$g == rule[[2]]$ploidy - 1)) {
    gam[[2]]$type <- "mix"
    gam[[2]]$add_dr <- TRUE
    gam[[2]]$gamma <- 1
    if (is.null(rule[[2]]$beta)) {
      gam[[2]]$beta <- par[[cindex]]
      cindex <- cindex + 1
    } else {
      gam[[2]]$beta <- rule[[2]]$beta
    }
  } else if (rule[[2]]$type == "mix" && (rule[[2]]$g == 1 || rule[[2]]$g == rule[[2]]$ploidy - 1)) {
    gam[[2]]$type <- "mix"
    gam[[2]]$gamma <- 1
    gam[[2]]$add_dr <- FALSE
  } else if ((rule[[2]]$type == "mix" || rule[[2]]$type == "mix_dr")) {
    gam[[2]]$type <- "mix"
    gam[[2]]$add_dr <- FALSE
    if (rule[[2]]$g != 0 && rule[[2]]$g != rule[[2]]$ploidy) {
      if (is.null(rule[[2]]$gamma)) {
        npp <- n_pp_mix(g = rule[[2]]$g, ploidy = rule[[2]]$ploidy)
        if (gamma_type == "real") {
          gam[[2]]$gamma <- real_to_simplex(par[cindex:(cindex + npp - 2)])
          cindex <- cindex + npp - 1
        } else if (gamma_type == "simplex") {
          gam[[2]]$gamma <- par[cindex:(cindex + npp - 1)]
          cindex <- cindex + npp
        }
      } else {
        gam[[2]]$gamma <- rule[[2]]$gamma
      }
    }
  } else {
    stop("par_to_gam 2: how did you get here?")
  }

  if (rule[[3]]$outlier) {
    if (is.null(rule[[3]]$pi)) {
      gam[[3]]$pi <- par[[cindex]]
      cindex <- cindex + 1
    } else {
      gam[[3]]$pi <- rule[[3]]$pi
    }
  }

  return(gam)
}

#' Par to genotype frequencies
#'
#' @inheritParams par_to_gam
#' @param nudge how much to move above 0.
#' @param gamma_type Is gamma on real transformed on simplex?
#'
#' @return The vector of genotype frequencies.
#'
#' @author David Gerard
#'
#' @examples
#' rule <- list(
#'   list(ploidy = 4, g = 2, type = "mix"),
#'   list(ploidy = 8, g = 4, type = "polysomic"),
#'   list(outlier = TRUE)
#'   )
#' par <- c(-2, 0.1, 0.2, 0.03)
#' par_to_gf(par = par, rule = rule)
#'
#' rule <- list(
#'   list(ploidy = 4, g = 0, type = "mix"),
#'   list(ploidy = 8, g = 8, type = "polysomic"),
#'   list(outlier = FALSE)
#'   )
#' par <- c()
#' par_to_gf(par = par, rule = rule)
#'
#' @noRd
par_to_gf <- function(par, rule, nudge = sqrt(.Machine$double.eps), gamma_type = c("real", "simplex")) {
  gamma_type <- match.arg(gamma_type)
  gampar <- par_to_gam(par = par, rule = rule, gamma_type = gamma_type)
  p1 <- do.call(what = gamfreq, args = gampar[[1]])
  p2 <- do.call(what = gamfreq, args = gampar[[2]])
  q <- convolve_2(p1 = p1, p2 = p2, nudge = nudge)
  if (gampar[[3]]$outlier) {
    q <- q * (1 - gampar[[3]]$pi) + gampar[[3]]$pi / length(q)
  }
  return(q)
}


#' Inverse function of par_to_gam()
#'
#' @param gam A list of length 3. The first contains the parameters from
#'     gamfreq() for parent 1, the second contains the parameters from
#'     gamfreq() of parent 2. The third contains the outlier proportion.
#'     The parameters in the parent lists are
#'     \describe{
#'       \item{\code{ploidy}}{Ploidy of the parent}
#'       \item{\code{g}}{Genotype of the parent}
#'       \item{\code{gamma}}{Preferential pairing parameter of parent}
#'       \item{\code{beta}}{Double reduction adjustment at simplex markers}
#'       \item{\code{alpha}}{Double reduction rates}
#'       \item{\code{type}}{Either \code{"mix"} or \code{"polysomic"}.}
#'       \item{\code{add_dr}}{Logical. Under "mix" should we adjust for double reduction at simplex loci?}
#'     }
#'     The third list contains the following elements
#'     \describe{
#'       \item{\code{outlier}}{A logical. Should we account for outliers?}
#'       \item{\code{pi}}{The outlier proportion.}
#'     }
#' @param fix_list A list of length three, determining which elements are fixed
#'    (not part of \code{par}). They are all logicals. Each list element corresponds
#'    to a list element in \code{gam}. E.g. \code{fix_list[[1]]$alpha = TRUE}
#'    would mean that \code{gam[[1]]$alpha} is fixed (not part of \code{par}).
#'    This can be NULL if not elements are fixed.
#' @param db Bound on double reduction rate(s). Should we use the CES or the
#'    PRCS model for the upper bounds of the double reduction rate.
#' @param ob Upper bound on the outlier proportion.
#' @param gamma_type If \code{gamma_type = "real"}, then it assumes that the
#'    gamma values are transformed on the real line. Otherwise, it assumes
#'    the gamma values are on the simplex.
#'
#'
#' @return A list of length 4, containing \code{par}, \code{rule},
#'     \code{lower}, and \code{upper}. See \code{\link{par_to_gam}()}
#'     for a description of \code{par} and \code{rule}. The \code{lower}
#'     and \code{upper} elements are the bounds on \code{par}. \code{name}
#'     tells you what each element of par corresponds to.
#'
#' @seealso \code{\link{gamfreq}()} for more details on the parent parameters.
#'
#' @author David Gerard
#'
#' @noRd
#'
#' @examples
#' gam <- list()
#' gam[[1]] <- list(
#'   ploidy = 4,
#'   g = 1,
#'   alpha = 1/6,
#'   beta = NULL,
#'   gamma = NULL,
#'   type = "polysomic",
#'   add_dr = FALSE)
#' gam[[2]] <- list(
#'   ploidy = 8,
#'   g = 4,
#'   alpha = NULL,
#'   beta = NULL,
#'   gamma = c(0.1, 0.8, 0.1),
#'   type = "mix",
#'   add_dr = FALSE)
#' gam[[3]] <- list(
#'   outlier = TRUE,
#'   pi = 0.03
#' )
#' gam_to_par(gam)
gam_to_par <- function(
    gam,
    fix_list = NULL,
    db = c("ces", "prcs"),
    ob = 0.03,
    gamma_type = c("real", "simplex")) {
  ret <- list()
  ret$par <- c()
  ret$rule <- list()
  ret$lower <- c()
  ret$upper <- c()
  ret$name <- c()
  db <- match.arg(db)
  gamma_type <- match.arg(gamma_type)

  ## prepopulate rule
  ret$rule[[1]] <- list(
    ploidy = gam[[1]]$ploidy,
    g = gam[[1]]$g,
    type = NULL
  )
  ret$rule[[2]] <- list(
    ploidy = gam[[2]]$ploidy,
    g = gam[[2]]$g,
    type = NULL
  )
  ret$rule[[3]] <- list(
    outlier = NULL
  )

  if (is.null(fix_list)) {
    fix_list <- list(
      list(
        alpha = FALSE,
        beta = FALSE,
        gamma = FALSE
      ),
      list(
        alpha = FALSE,
        beta = FALSE,
        gamma = FALSE
      ),
      list(
        pi = FALSE
      )
    )
  }

  ## smallest and largest values for transformed gamma parameter
  lower_val <- -20
  upper_val <- 20
  TOL <- pkg_env$TOL_small

  ## parent 1
  if (gam[[1]]$g == 0 || gam[[1]]$g == gam[[1]]$ploidy) {
    ret$rule[[1]]$type <- gam[[1]]$type
  } else if (gam[[1]]$type == "polysomic" && gam[[1]]$g != 0 && gam[[1]]$g != gam[[1]]$ploidy) {
    if (!fix_list[[1]]$alpha) {
      nd <- floor(gam[[1]]$ploidy / 4)
      stopifnot(length(gam[[1]]$alpha) == nd)
      ret$par <- c(ret$par, gam[[1]]$alpha)
      ret$lower <- c(ret$lower, rep(x = TOL, times = nd))
      ret$upper <- c(ret$upper, drbounds(ploidy = gam[[1]]$ploidy, model = db))
      ret$name <- c(ret$name, rep(x = "alpha_1", times = nd))
    } else {
      ret$rule[[1]]$alpha <- gam[[1]]$alpha
    }
    ret$rule[[1]]$type <- "polysomic"
  } else if (gam[[1]]$type == "mix" && gam[[1]]$add_dr && (gam[[1]]$g == 1 || gam[[1]]$g == gam[[1]]$ploidy - 1)) {
    if (!fix_list[[1]]$beta) {
      stopifnot(length(gam[[1]]$beta) == 1)
      ret$par <- c(ret$par, gam[[1]]$beta)
      ret$lower <- c(ret$lower, TOL)
      ret$upper <- c(ret$upper, beta_bounds(ploidy = gam[[1]]$ploidy, model = db))
      ret$name <- c(ret$name, "beta_1")
    } else {
      ret$rule[[1]]$beta <- gam[[1]]$beta
    }
    ret$rule[[1]]$type <- "mix_dr"
  } else if (gam[[1]]$type == "mix" && !gam[[1]]$add_dr && (gam[[1]]$g == 1 || gam[[1]]$g == gam[[1]]$ploidy - 1)) {
    ret$rule[[1]]$type <- "mix"
    ret$rule[[1]]$beta <- 0
  } else if (gam[[1]]$type == "mix" && gam[[1]]$g > 1 && gam[[1]]$g < gam[[1]]$ploidy - 1) {
    if (!fix_list[[1]]$gamma) {
      npp <- n_pp_mix(g = gam[[1]]$g, ploidy = gam[[1]]$ploidy)
      stopifnot(length(gam[[1]]$gamma) == npp)
      if (gamma_type == "real") {
        ret$par <- c(ret$par, simplex_to_real(q = gam[[1]]$gamma))
        ret$lower <- c(ret$lower, rep(x = lower_val, times = npp - 1))
        ret$upper <- c(ret$upper, rep(x = upper_val, times = npp - 1))
        ret$name <- c(ret$name, rep(x = "gamma_1", times = npp - 1))
      } else if (gamma_type == "simplex") {
        ret$par <- c(ret$par, gam[[1]]$gamma)
        ret$lower <- c(ret$lower, rep(x = TOL, times = npp))
        ret$upper <- c(ret$upper, rep(x = 1 - TOL, times = npp))
        ret$name <- c(ret$name, rep(x = "gamma_1", times = npp))
      }
    } else {
      ret$rule[[1]]$gamma <- gam[[1]]$gamma
    }
    ret$rule[[1]]$type <- "mix"
  }

  ## parent 2
  if (gam[[2]]$g == 0 || gam[[2]]$g == gam[[2]]$ploidy) {
    ret$rule[[2]]$type <- gam[[2]]$type
  } else if (gam[[2]]$type == "polysomic" && gam[[2]]$g != 0 && gam[[2]]$g != gam[[2]]$ploidy) {
    if (!fix_list[[2]]$alpha) {
      nd <- floor(gam[[2]]$ploidy / 4)
      stopifnot(length(gam[[2]]$alpha) == nd)
      ret$par <- c(ret$par, gam[[2]]$alpha)
      ret$lower <- c(ret$lower, rep(x = TOL, times = nd))
      ret$upper <- c(ret$upper, drbounds(ploidy = gam[[2]]$ploidy, model = db))
      ret$name <- c(ret$name, rep(x = "alpha_2", times = nd))
    } else {
      ret$rule[[2]]$alpha <- gam[[2]]$alpha
    }
    ret$rule[[2]]$type <- "polysomic"
  } else if (gam[[2]]$type == "mix" && gam[[2]]$add_dr && (gam[[2]]$g == 1 || gam[[2]]$g == gam[[2]]$ploidy - 1)) {
    if (!fix_list[[2]]$beta) {
      stopifnot(length(gam[[2]]$beta) == 1)
      ret$par <- c(ret$par, gam[[2]]$beta)
      ret$lower <- c(ret$lower, TOL)
      ret$upper <- c(ret$upper, beta_bounds(ploidy = gam[[2]]$ploidy, model = db))
      ret$name <- c(ret$name, "beta_2")
    } else {
      ret$rule[[2]]$beta <- gam[[2]]$beta
    }
    ret$rule[[2]]$type <- "mix_dr"
  } else if (gam[[2]]$type == "mix" && !gam[[2]]$add_dr && (gam[[2]]$g == 1 || gam[[2]]$g == gam[[2]]$ploidy - 1)) {
    ret$rule[[2]]$type <- "mix"
    ret$rule[[2]]$beta <- 0
  } else if (gam[[2]]$type == "mix" && gam[[2]]$g > 1 && gam[[2]]$g < gam[[2]]$ploidy - 1) {
    if (!fix_list[[2]]$gamma) {
      npp <- n_pp_mix(g = gam[[2]]$g, ploidy = gam[[2]]$ploidy)
      stopifnot(length(gam[[2]]$gamma) == npp)
      if (gamma_type == "real") {
        ret$par <- c(ret$par, simplex_to_real(q = gam[[2]]$gamma))
        ret$lower <- c(ret$lower, rep(x = lower_val, times = npp - 1))
        ret$upper <- c(ret$upper, rep(x = upper_val, times = npp - 1))
        ret$name <- c(ret$name, rep(x = "gamma_2", times = npp - 1))
      } else if (gamma_type == "simplex") {
        ret$par <- c(ret$par, gam[[2]]$gamma)
        ret$lower <- c(ret$lower, rep(x = TOL, times = npp))
        ret$upper <- c(ret$upper, rep(x = 1 - TOL, times = npp))
        ret$name <- c(ret$name, rep(x = "gamma_2", times = npp))
      }
    } else {
      ret$rule[[2]]$gamma <- gam[[2]]$gamma
    }
    ret$rule[[2]]$type <- "mix"
  }

  ## outlier
  if (gam[[3]]$outlier) {
    if(!fix_list[[3]]$pi) {
      stopifnot(length(gam[[3]]$pi) == 1)
      ret$par <- c(ret$par, gam[[3]]$pi)
      ret$lower <- c(ret$lower, TOL)
      ret$upper <- c(ret$upper, ob)
      ret$name <- c(ret$name, "pi")
    } else {
      ret$rule[[3]]$pi <- gam[[3]]$pi
    }
    ret$rule[[3]]$outlier <- TRUE
  } else {
    ret$rule[[3]]$outlier <- FALSE
  }

  return(ret)
}


#' (log) likelihood when genotypes are known
#'
#' @inheritParams par_to_gam
#' @param x vector of counts of offspring genotypes
#' @param neg Should we return the negative log likelihood (TRUE) or the log-likelihood (FALSE)?
#'
#' @author David Gerard
#'
#' @noRd
ll_g <- function(par, rule, x, neg = FALSE) {
  gf <- par_to_gf(par = par, rule = rule)
  ll <- stats::dmultinom(x = x, prob = gf, log = TRUE)
  if (neg) {
    ll <- -1 * ll
  }
  return(ll)
}

#' log likelihood when genotypes are not known
#'
#' @inheritParams par_to_gam
#' @param x A matrix of genotype log-likelihoods. The rows index the
#'     individuals and the columns index the genotypes.
#' @param neg Should we return the negative log likelihood (TRUE) or the log-likelihood (FALSE)?
#'
#' @author David Gerard
#'
#' @noRd
ll_gl <- function(par, rule, x, neg = FALSE) {
  gf <- par_to_gf(par = par, rule = rule)
  ll <- llike_li(B = x, lpivec = log(gf))
  if (neg) {
    ll <- -1 * ll
  }
  return(ll)
}

#' @title Test for segregation distortion in a polyploid F1 population.
#'
#' @description
#' Provides tests for segregation distortion for an F1 population of polyploids
#' under various models of meiosis. You can use this test for autopolyploids that exhibit
#' full polysomic inheritance, allopolyploids that exhibit full
#' disomic inheritance, or segmental allopolyploids that exhibit
#' partial preferential pairing. Double reduction is (optionally) fully accounted
#' for in tetraploids, and (optionally) partially accounted for (only at simplex
#' loci) for higher ploidies. Some maximum proportion of outliers can be
#' specified (default at 3%), and so this method can accommodate moderate
#' levels of double reduction at non-simplex loci. Offspring genotypes can
#' either be known, or genotype uncertainty can be represented through
#' genotype likelihoods. Parent data may or may not be provided, at your
#' option. Parents can have different (even) ploidies, at your option.
#' Details of the methods may be found in Gerard et al. (2025).
#'
#' @section Null Model:
#' The gamete frequencies under the null model can be calculated via
#' \code{\link{gamfreq}()}. The genotype frequencies, which are just
#' a discrete linear convolution (\code{\link[stats]{convolve}()}) of the
#' gamete frequencies, can be calculated via \code{\link{gf_freq}()}.
#'
#' The null model's gamete frequencies for true autopolyploids
#' (\code{model = "auto"}) or
#' true allopolyploids (\code{model = "allo"}) are given in the \code{\link{seg}} data frame
#' that comes with this package. I only made that data frame go up to
#' ploidy 20, but let me know if you need it for higher ploidies.
#'
#' The polyRAD folks test for full autopolyploid and full allopolyploid, so I
#' included that as an option (\code{model = "auto_allo"}).
#'
#' We can account for arbitrary levels of double reduction in autopolyploids
#' (\code{model = "auto_dr"}) using the gamete frequencies from
#' Huang et al (2019).
#'
#' The null model for segmental allopolyploids (\code{model = "allo_pp"}) is the mixture model of
#' the possible allopolyploid gamete frequencies. The autopolyploid model
#' (without double reduction) is a subset of this mixture model.
#'
#' In the above mixture model, we can account for double reduction for simplex
#' loci (\code{model = "seg"}) by just slightly reducing the
#' number of simplex gametes and increasing the number of duplex and
#' nullplex gametes. That is, the frequencies for (nullplex, simplex, duplex)
#' gametes go from \code{(0.5, 0.5, 0)} to
#' \code{(0.5 + b, 0.5 - 2 * b, b)}.
#'
#' \code{model = "seg"} is the most general, so it is the default. But you
#' should use other models if you have more information on your species. E.g.
#' if you know you have an autopolyploid, use either \code{model = "auto"}
#' or \code{model = "auto_dr"}.
#'
#' @section Unidentified Parameters:
#' Do NOT interpret the estimated parameters in the \code{null$gam} list.
#' These parameters are weakly identified (I had to do some fancy
#' spectral methods to account for this in the null distribution
#' of the tests). Even though they are technically identified, you would
#' need a massive data set to be able to estimate them accurately.
#'
#' @param x The data. Can be one of two forms:
#'   \itemize{
#'     \item{A vector of genotype counts. This is when offspring genotypes are known.}
#'     \item{A matrix of genotype log-likelihoods. This is when there is genotype uncertainty.
#'           The rows index the individuals and the columns index the possible genotypes.
#'           The genotype log-likelihoods should be base e (natural log).}
#'   }
#' @param p1_ploidy,p2_ploidy The ploidy of the first or second parent. Should be even.
#' @param p1,p2 One of three forms:
#'   \itemize{
#'     \item{The known genotype of the first or second parent.}
#'     \item{The vector of genotype log-likelihoods of the first or second parent. Should be base e (natural log).}
#'     \item{\code{NULL} (completely unknown)}
#'   }
#' @param model One of six forms:
#'   \describe{
#'     \item{\code{"seg"}}{Segmental allopolyploid. Allows for arbitrary levels of polysomic and disomic inheritance. This can account for partial preferential pairing. It also accounts for double reduction at simplex loci.}
#'     \item{\code{"auto"}}{Autopolyploid. Allows only for polysomic inheritance. No double reduction.}
#'     \item{\code{"auto_dr"}}{Autopolyploid, allowing for the effects of double reduction.}
#'     \item{\code{"allo"}}{Allopolyploid. Only complete disomic inheritance is explored.}
#'     \item{\code{"allo_pp"}}{Allopolyploid, allowing for the effects of partial preferential pairing. Though, autopolyploid (with complete bivalent pairing and no double reduction) is a special case of this model.}
#'     \item{\code{"auto_allo"}}{Only complete disomic and complete polysomic inheritance is studied.}
#'   }
#' @param outlier A logical. Should we allow for outliers (\code{TRUE}) or not (\code{FALSE})?
#' @param ret_out A logical. Should we return the probability that each individual is an outlier (\code{TRUE}) or not (\code{FALSE})?
#' @param ob The default upper bound on the outlier proportion.
#' @param db Should we use the complete equational segregation model (\code{"ces"}) or
#'    the pure random chromatid segregation model (\code{"prcs"}) to determine the upper
#'    bound(s) on the double reduction rate(s). See \code{\link{drbounds}()}
#'    for details.
#' @param ntry The number of times to try the optimization.
#'    You probably do not want to touch this.
#' @param opt For local optimization, should we use bobyqa (Powell, 2009) or L-BFGS-B (Byrd et al, 1995)? You probably do not want to touch this.
#' @param optg Initial global optimization used to start local optimization. Methods are described in the \href{https://nlopt.readthedocs.io/en/latest/Citing_NLopt/}{\code{nloptr}} package (Johnson, 2008). You probably do not want to touch this. Possible values are:
#'   \describe{
#'     \item{\code{"NLOPT_GN_MLSL_LDS"}}{MLSL (Multi-Level Single-Linkage). Kucherenko and Sytsko (2005)}
#'     \item{\code{"NLOPT_GN_ESCH"}}{ESCH (evolutionary algorithm). da Silva Santos et al. (2010)}
#'     \item{\code{"NLOPT_GN_CRS2_LM"}}{Controlled Random Search (CRS) with local mutation. Kaelo and Ali (2006)}
#'     \item{\code{"NLOPT_GN_ISRES"}}{ISRES (Improved Stochastic Ranking Evolution Strategy). Runarsson and Yao (2005)}
#'   }
#' @param df_tol Threshold for the rank of the Jacobian for the degrees of
#'    freedom calculation. This accounts for weak identifiability in the
#'    null model. You probably do not want to touch this.
#' @param chisq A logical. When using known genotypes, this flags to use
#'    the chi-squared test or the Likelihood Ratio Test. Default is \code{FALSE}
#'    for the likelihood ratio test.
#'
#' @return A list with some or all of the following elements
#' \describe{
#'   \item{\code{stat}}{The test statistic.}
#'   \item{\code{df}}{The degrees of freedom of the test.}
#'   \item{\code{p_value}}{The p-value of the test.}
#'   \item{\code{null_bic}}{The null model's BIC.}
#'   \item{\code{outprob}}{Outlier probabilities. Only returned in \code{ret_out = TRUE}.
#'     \itemize{
#'       \item{If using genotype counts, element \code{i} is the probability that an individual \emph{with genotype} \code{i-1} is an outlier. So the return vector has length ploidy plus 1.}
#'       \item{If using genotype log-likelihoods, element \code{i} is the probability that individual \code{i} is an outlier. So the return vector has the same length as the number of individuals.}
#'       }
#'     These outlier probabilities are only valid if the null of no segregation is true.
#'    }
#'   \item{\code{null}}{A list with estimates and information on the null model.
#'   \describe{
#'     \item{\code{l0_pp}}{Maximized likelihood under the null plus the parent log-likelihoods.}
#'     \item{\code{l0}}{Maximized likelihood under using estimated parent genotypes are known parent genotypes.}
#'     \item{\code{q0}}{Estimated genotype frequencies under the null.}
#'     \item{\code{df0}}{Estimated number of parameters under the null.}
#'     \item{\code{gam}}{A list of three lists with estimates of the model
#'       parameters. The third list contains the elements \code{outlier}
#'       (which is \code{TRUE} if outliers were modeled) and \code{pi}
#'       (the estimated outlier proportion). The first two lists contain
#'       information on each parent with the following elements:
#'         \describe{
#'           \item{\code{ploidy}}{The ploidy of the parent.}
#'           \item{\code{g}}{The (estimated) genotype of the parent.}
#'           \item{\code{alpha}}{The estimated double reduction rate(s). \code{alpha[i]} is the estimated probability that a gamete has \code{i} copies of identical by double reduction alleles.}
#'           \item{\code{beta}}{Double reduction's effect on simplex loci when \code{type = "mix"} and \code{add_dr = TRUE}.}
#'           \item{\code{gamma}}{The mixing proportions for the pairing configurations. The order is the same as in \code{\link{seg}}.}
#'           \item{\code{type}}{Either \code{"mix"} or \code{"polysomic"}}
#'           \item{\code{add_dr}}{Did we model double reduction at simplex loci when using \code{type = "mix"} (\code{TRUE}) or not (\code{FALSE})?}
#'         }
#'       }
#'     }
#'   }
#'   \item{\code{alt}}{A list with estimates and information on the alternative model.
#'     \describe{
#'       \item{\code{l1}}{The maximized likelihood under the alternative.}
#'       \item{\code{q1}}{The estimated genotype frequencies under the alternative.}
#'       \item{\code{df1}}{The estimated number of parameters under the alternative.}
#'     }
#'   }
#' }
#'
#' @author David Gerard
#'
#' @references
#' \itemize{
#'   \item{Byrd, R. H., Lu, P., Nocedal, J., & Zhu, C. (1995). A limited memory algorithm for bound constrained optimization. \emph{SIAM Journal on scientific computing}, 16(5), 1190-1208. \doi{10.1137/0916069}}
#'   \item{da Silva Santos, C. H., Goncalves, M. S., & Hernandez-Figueroa, H. E. (2010). Designing novel photonic devices by bio-inspired computing. \emph{IEEE Photonics Technology Letters}, 22(15), 1177-1179. \doi{10.1109/LPT.2010.2051222}}
#'   \item{Gerard, D, Ambrosano, GB, Pereira, GdS, & Garcia, AAF (2025). Tests for segregation distortion in higher ploidy F1 populations. \emph{bioRxiv}, p. 1-20. \doi{10.1101/2025.06.23.661114}}
#'   \item{Huang, K., Wang, T., Dunn, D. W., Zhang, P., Cao, X., Liu, R., & Li, B. (2019). Genotypic frequencies at equilibrium for polysomic inheritance under double-reduction. \emph{G3: Genes, Genomes, Genetics}, 9(5), 1693-1706. \doi{10.1534/g3.119.400132}}
#'   \item{Johnson S (2008). The NLopt nonlinear-optimization package. \url{https://github.com/stevengj/nlopt}.}
#'   \item{Kaelo, P., & Ali, M. M. (2006). Some variants of the controlled random search algorithm for global optimization. \emph{Journal of optimization theory and applications}, 130, 253-264. \doi{10.1007/s10957-006-9101-0}}
#'   \item{Kucherenko, S., & Sytsko, Y. (2005). Application of deterministic low-discrepancy sequences in global optimization. \emph{Computational Optimization and Applications}, 30, 297-318. \doi{10.1007/s10589-005-4615-1}}
#'   \item{Powell, M. J. D. (2009), The BOBYQA algorithm for bound constrained optimization without derivatives, Report No. DAMTP 2009/NA06, Centre for Mathematical Sciences, University of Cambridge, UK.}
#'   \item{Runarsson, T. P., & Yao, X. (2005). Search biases in constrained evolutionary optimization. \emph{IEEE Transactions on Systems, Man, and Cybernetics, Part C (Applications and Reviews)}, 35(2), 233-243. \doi{https://doi.org/10.1109/TSMCC.2004.841906}}
#' }
#'
#' @examples
#' set.seed(1)
#' p1_ploidy <- 4
#' p1 <- 1
#' p2_ploidy <- 8
#' p2 <- 4
#' q <- gf_freq(
#'   p1_g = p1,
#'   p1_ploidy = p1_ploidy,
#'   p1_gamma = 1,
#'   p1_type = "mix",
#'   p2_g = p2,
#'   p2_ploidy = p2_ploidy,
#'   p2_gamma= c(0.2, 0.2, 0.6),
#'   p2_type = "mix",
#'   pi = 0.01)
#' nvec <- c(stats::rmultinom(n = 1, size = 200, prob = q))
#' gl <- simgl(nvec = nvec)
#' seg_lrt(x = nvec, p1_ploidy = p1_ploidy, p2_ploidy = p2_ploidy, p1 = p1, p2 = p2)$p_value
#' seg_lrt(x = gl, p1_ploidy = p1_ploidy, p2_ploidy = p2_ploidy, p1 = p1, p2 = p2)$p_value
#'
#' @export
seg_lrt <- function(
    x,
    p1_ploidy,
    p2_ploidy = p1_ploidy,
    p1 = NULL,
    p2 = NULL,
    model = c("seg", "auto", "auto_dr", "allo", "allo_pp", "auto_allo"),
    outlier = TRUE,
    ret_out = FALSE,
    ob = 0.03,
    db = c("ces", "prcs"),
    ntry = 3,
    opt = c("bobyqa", "L-BFGS-B"),
    optg = c("NLOPT_GN_MLSL_LDS", "NLOPT_GN_ESCH", "NLOPT_GN_CRS2_LM", "NLOPT_GN_ISRES"),
    df_tol = 1e-3,
    chisq = FALSE) {

  ## Check input ----------
  opt <- match.arg(opt)
  optg <- match.arg(optg) ## "NLOPT_GN_DIRECT_L" works bad

  model <- match.arg(model)
  db <- match.arg(db)
  stopifnot(ntry > 0, length(ntry) == 1)
  stopifnot(df_tol >= 0, length(df_tol) == 1)
  stopifnot(is.logical(chisq), length(chisq) == 1)
  stopifnot(is.logical(ret_out), length(ret_out) == 1)
  stopifnot(
    p1_ploidy %% 2 == 0, p1_ploidy > 1,
    p2_ploidy %% 2 == 0, p2_ploidy > 1)
  ## Check if genotype counts or genotype log likelihoods.
  if (is.matrix(x)) {
    stopifnot(ncol(x) == (p1_ploidy + p2_ploidy) / 2 + 1)
    data_type <- "glike"
    nind <- nrow(x)
    ll_now <- ll_gl
    if (any(is.na(x))) {
      narow <- apply(x, 1, function(y) all(is.na(y)))
      x[narow, ] <- 0
      if (any(is.na(x))) {
        stop("seg_lrt(): There are some individuals with some (but not all) genotype likelihoods missing. For an individual, only all or none can be missing.")
      }
    }
  } else {
    stopifnot(length(x) == (p1_ploidy + p2_ploidy) / 2 + 1, x >= 0)
    data_type <- "gcount"
    if (sum(x != 0) == 1) {
      opt <- "L-BFGS-B" ## bobyqa doesn't work for this scenario
    }
    nind <- sum(x)
    ll_now <- ll_g
  }
  ## Get p1 data type. p1_pos is possible dosages, p1 is genotype log-likelihoods
  if (is.null(p1)) {
    p1 <- rep(0, times = p1_ploidy + 1)
    p1_pos <- 0:p1_ploidy
  } else if (length(p1) == 1) {
    stopifnot(p1 >= 0, p1 <= p1_ploidy, length(p1) == 1)
    p1_pos <- p1
    p1 <- rep(-Inf, times = p1_ploidy + 1)
    p1[p1_pos + 1] <- 0
  } else {
    stopifnot(length(p1) == p1_ploidy + 1)
    p1_pos <- 0:p1_ploidy
  }
  ## Get p2 data type. p2_pos is possible dosages, p2 is genotype log-likelihoods
  if (is.null(p2)) {
    p2 <- rep(0, times = p2_ploidy + 1)
    p2_pos <- 0:p2_ploidy
  } else if (length(p2) == 1) {
    stopifnot(p2 >= 0, p2 <= p2_ploidy, length(p2) == 1)
    p2_pos <- p2
    p2 <- rep(-Inf, times = p2_ploidy + 1)
    p2[p2_pos + 1] <- 0
  } else {
    stopifnot(length(p2) == p2_ploidy + 1)
    p2_pos <- 0:p2_ploidy
  }
  ## check outlier inputs
  stopifnot(length(outlier) == 1, is.logical(outlier))
  stopifnot(length(ob) == 1, ob >= 0, ob <= 1)
  if (ob < pkg_env$TOL_small) {
    outlier <- FALSE
  }

  ## Set up gam
  gam <- list()
  gam[[1]] <- list(
    ploidy = p1_ploidy,
    g = NULL,
    alpha = NULL,
    beta = NULL,
    gamma = NULL,
    type = NULL,
    add_dr = NULL)
  gam[[2]] <- list(
    ploidy = p2_ploidy,
    g = NULL,
    alpha = NULL,
    beta = NULL,
    gamma = NULL,
    type = NULL,
    add_dr = NULL)
  gam[[3]] <- list(
    outlier = outlier,
    pi = NULL
  )
  if (model == "seg") {
    gam[[1]]$type <- "mix"
    gam[[1]]$add_dr <- TRUE
    gam[[2]]$type <- "mix"
    gam[[2]]$add_dr <- TRUE
  } else if (model == "auto") {
    gam[[1]]$type <- "polysomic"
    gam[[1]]$add_dr <- FALSE
    gam[[2]]$type <- "polysomic"
    gam[[2]]$add_dr <- FALSE
  } else if (model == "auto_dr") {
    gam[[1]]$type <- "polysomic"
    gam[[1]]$add_dr <- TRUE
    gam[[2]]$type <- "polysomic"
    gam[[2]]$add_dr <- TRUE
  } else if (model == "allo" || model == "allo_pp" || model == "auto_allo") {
    gam[[1]]$type <- "mix"
    gam[[1]]$add_dr <- FALSE
    gam[[2]]$type <- "mix"
    gam[[2]]$add_dr <- FALSE
  }

  ## Alternative optimization ----
  if (data_type == "glike") {
    log_qhat1 <- c(em_li(B = x))
    qhat1 <- exp(log_qhat1)
    l1 <- llike_li(B = x, lpivec = log_qhat1)
  } else if (data_type == "gcount") {
    qhat1 <- x / sum(x)
    l1 <- stats::dmultinom(x = x, prob = x / sum(x), log = TRUE)
  }
  alt_best <- list(
    l1 = l1,
    q1 = qhat1
  )

  ## Null optimization ----
  if (model == "seg" || model == "allo_pp" || model == "auto" || model == "auto_dr") {
    fix_list <- list(
      list(
        alpha = FALSE,
        beta = FALSE,
        gamma = FALSE
      ),
      list(
        alpha = FALSE,
        beta = FALSE,
        gamma = FALSE
      ),
      list(
        pi = FALSE
      )
    )

    if (model == "auto") {
      fix_list[[1]]$alpha <- TRUE
      fix_list[[2]]$alpha <- TRUE
      gam[[1]]$alpha <- rep(x = 0, times = floor(gam[[1]]$ploidy / 4))
      gam[[2]]$alpha <- rep(x = 0, times = floor(gam[[2]]$ploidy / 4))
    }

    null_best <- list()
    null_best$l0_pp <- -Inf
    for (p1_geno in p1_pos) {
      for (p2_geno in p2_pos) {
        gam[[1]]$g <- p1_geno
        gam[[2]]$g <- p2_geno
        nudge <- pkg_env$TOL_small ## bound for starting values
        for (i in seq_len(ntry)) {
          ## Do parent 1
          if (model == "auto_dr") {
            ndr <- floor(gam[[1]]$ploidy / 4)
            gam[[1]]$alpha <- stats::runif(n = ndr, min = rep(x = nudge, times = ndr), max = drbounds(ploidy = gam[[1]]$ploidy, model = db) - nudge)
          } else if (model == "seg") {
            if (p1_geno == 1 || p1_geno == p1_ploidy - 1) {
              gam[[1]]$gamma <- 1
              gam[[1]]$beta <- stats::runif(n = 1, min = nudge, max = beta_bounds(ploidy = p1_ploidy) - nudge)
            } else {
              nmix <- n_pp_mix(g = p1_geno, ploidy = p1_ploidy)
              gam[[1]]$gamma <- stats::runif(n = nmix)
              gam[[1]]$gamma <- gam[[1]]$gamma / sum(gam[[1]]$gamma)
            }
          } else if (model == "allo_pp") {
            if (p1_geno == 1 || p1_geno == p1_ploidy - 1) {
              gam[[1]]$gamma <- 1
            } else {
              nmix <- n_pp_mix(g = p1_geno, ploidy = p1_ploidy)
              gam[[1]]$gamma <- stats::runif(n = nmix)
              gam[[1]]$gamma <- gam[[1]]$gamma / sum(gam[[1]]$gamma)
            }
          }

          ## do parent 2
          if (model == "auto_dr") {
            ndr <- floor(gam[[2]]$ploidy / 4)
            gam[[2]]$alpha <- stats::runif(n = ndr, min = rep(x = nudge, times = ndr), max = drbounds(ploidy = gam[[2]]$ploidy, model = db) - nudge)
          } else if (model == "seg") {
            if (p2_geno == 1 || p2_geno == p2_ploidy - 1) {
              gam[[2]]$gamma <- 1
              gam[[2]]$beta <- stats::runif(n = 1, min = nudge, max = beta_bounds(ploidy = p2_ploidy) - nudge)
            } else {
              nmix <- n_pp_mix(g = p2_geno, ploidy = p2_ploidy)
              gam[[2]]$gamma <- stats::runif(n = nmix)
              gam[[2]]$gamma <- gam[[2]]$gamma / sum(gam[[2]]$gamma)
            }
          } else if (model == "allo_pp") {
            if (p2_geno == 1 || p2_geno == p2_ploidy - 1) {
              gam[[2]]$gamma <- 1
            } else {
              nmix <- n_pp_mix(g = p2_geno, ploidy = p2_ploidy)
              gam[[2]]$gamma <- stats::runif(n = nmix)
              gam[[2]]$gamma <- gam[[2]]$gamma / sum(gam[[2]]$gamma)
            }
          }

          if (gam[[3]]$outlier) {
            gam[[3]]$pi <- stats::runif(n = 1, min = 0, max = ob)
          }

          ret <- gam_to_par(gam = gam, fix_list = fix_list, db = db, ob = ob)

          ## account edge case where gamma is simulated to start beyond bounds
          par_bad <- (ret$par < ret$lower) | (ret$par > ret$upper)
          if (any(par_bad)) {
            ret$par[par_bad] <- stats::runif(n = sum(par_bad), min = ret$lower[par_bad], max = ret$upper[par_bad])
          }

          if (length(ret$par) == 0) {
            oout <- list(
              value = ll_now(par = NULL, rule = ret$rule, x = x, neg = FALSE),
              par = NULL
            )
          } else if (length(ret$par) == 1) {
            oout <- stats::optim(
              par = ret$par,
              fn = ll_now,
              method = "Brent",
              rule = ret$rule,
              x = x,
              control = list(fnscale = -1),
              lower = ret$lower,
              upper = ret$upper)
          } else if (opt == "bobyqa") {
            oout_globe <- nloptr::nloptr(
              x0 = ret$par,
              eval_f = ll_now,
              lb = ret$lower,
              ub = ret$upper,
              x = x,
              rule = ret$rule,
              neg = TRUE,
              opts = list(algorithm = optg, ftol_rel = 1e-04, local_opts = list(algorithm = "NLOPT_LN_BOBYQA", ftol_rel = 1e-04)))
            ## sometimes, nloptr returns solutions just beyond bounds, weird.
            if (any(oout_globe$solution > ret$upper)) {
              oout_globe$solution[oout_globe$solution > ret$upper] <- ret$upper[oout_globe$solution > ret$upper]
            }
            if (any(oout_globe$solution < ret$lower)) {
              oout_globe$solution[oout_globe$solution < ret$lower] <- ret$lower[oout_globe$solution < ret$lower]
            }
            ## minqa gives warnings when changing tuning parameters, which is not worrying.
            suppressWarnings(
              oout_bob <- minqa::bobyqa(
                par = oout_globe$solution,
                fn = ll_now,
                lower = ret$lower,
                upper = ret$upper,
                x = x,
                rule = ret$rule,
                neg = TRUE)
            )
            oout <- list(
              value = -1 * oout_bob$fval,
              par = oout_bob$par
            )
          } else {
            oout_globe <- nloptr::nloptr(
              x0 = ret$par,
              eval_f = ll_now,
              lb = ret$lower,
              ub = ret$upper,
              x = x,
              rule = ret$rule,
              neg = TRUE,
              opts = list(algorithm = optg, ftol_rel = 1e-04, local_opts = list(algorithm = "NLOPT_LN_BOBYQA", ftol_rel = 1e-04)))
            oout <- stats::optim(
              par = oout_globe$solution,
              fn = ll_now,
              method = "L-BFGS-B",
              rule = ret$rule,
              x = x,
              control = list(fnscale = -1),
              lower = ret$lower,
              upper = ret$upper)
          }

          if (oout$value + p1[[p1_geno + 1]] + p2[[p2_geno + 1]] > null_best$l0_pp) {
            null_best$l0_pp <- oout$value + p1[[p1_geno + 1]] + p2[[p2_geno + 1]]
            null_best$l0 <- oout$value
            null_best$q0 <- par_to_gf(par = oout$par, rule = ret$rule) ## ML genotype frequency
            null_best$gam <- par_to_gam(par = oout$par, rule = ret$rule) ## MLE's
            null_best$df0 <- gam_to_df(gam = null_best$gam, fix_list = fix_list, db = db, ob = ob, df_tol = df_tol)
          }
          if (length(ret$par) <= 1) {
            ## Brent is awesome.
            break
          }
        }
      }
    }
  } else if (model == "allo" || model == "auto_allo") {
    if (model == "allo") {
      pos_modes <- c("disomic", "both")
    } else if (model == "auto_allo") {
      pos_modes <- c("disomic", "both", "polysomic")
    }
    null_best <- list()
    null_best$l0_pp <- -Inf
    for (p1_geno in p1_pos) {
      p1_list <- segtest::seg[segtest::seg$ploidy == p1_ploidy & segtest::seg$g == p1_geno & segtest::seg$mode %in% pos_modes, ]$p
      for (p2_geno in p2_pos) {
        p2_list <- segtest::seg[segtest::seg$ploidy == p2_ploidy & segtest::seg$g == p2_geno & segtest::seg$mode %in% pos_modes, ]$p
        for (i in seq_along(p1_list)) {
          p1_freq <- p1_list[[i]]
          for (j in seq_along(p2_list)) {
            p2_freq <- p2_list[[j]]
            zyg_freq <- convolve_2(p1 = p1_freq, p2 = p2_freq)
            if (!outlier) {
              oout <- list()
              oout$value <- ll_ao(par = 0, q = zyg_freq, x = x)
              oout$q0 <- zyg_freq
              oout$df0 <- 0
            } else {
              oout <- stats::optim(
                par = ob / 2,
                fn = ll_ao,
                method = "Brent",
                lower = pkg_env$TOL_small,
                upper = ob,
                control = list(fnscale = -1),
                q = zyg_freq,
                x = x)
              oout$q0 <- (1 - oout$par) * zyg_freq + oout$par / length(zyg_freq)
              if (oout$par > pkg_env$TOL_big && oout$par < ob - pkg_env$TOL_big) {
                oout$df0 <- 1
              } else {
                oout$df0 <- 0
              }
            }
            ## see if we have a new best
            if (oout$value + p1[[p1_geno + 1]] + p2[[p2_geno + 1]] > null_best$l0_pp) {
              null_best$l0_pp <- oout$value + p1[[p1_geno + 1]] + p2[[p2_geno + 1]]
              null_best$l0 <- oout$value
              null_best$q0 <- oout$q0
              null_best$df0 <- oout$df0
            }
          }
        }
      }
    }
  } else {
    stop("seg_lrt: how did you get here?")
  }

  ## subtract off if both zero under null and alt, then subtract off one.
  # alt_best$df1 <- length(alt_best$q1) - 1
  alt_best$df1 <- length(alt_best$q1) - sum((alt_best$q1 < pkg_env$TOL_big) & (null_best$q0 < pkg_env$TOL_big)) - 1

  ## chi-sauared test option
  if (chisq && data_type == "gcount") {
    ecounts <- null_best$q0 * sum(x)
    which_nonzero <- null_best$q0 > pkg_env$TOL_big
    stat_final <- sum((ecounts[which_nonzero] - x[which_nonzero])^2 / ecounts[which_nonzero])
  } else {
    stat_final <- 2 * (alt_best$l1 - null_best$l0)
  }

  ## Run test
  ret <- list(
    null = null_best,
    alt = alt_best,
    stat = stat_final,
    df = max(alt_best$df1 - null_best$df0, 1),
    null_bic = -2 * null_best$l0 + null_best$df0 * log(nind)
  )
  ret$p_value <- stats::pchisq(q = ret$stat, df = ret$df, lower.tail = FALSE)

  ## add outliers
  if (ret_out) {
    ret$outprob <- outprob(g = x, gam = null_best$gam)
  }

  return(ret)
}

#' Number of null parameters under auto_dr model
#'
#' Counts the dimenson of the inner parameters
#'
#' @inheritParams gam_to_par
#' @param df_tol Multiply the largest singular value by `df_tol` to get the
#'    lower bound threshold for numeric rank estimation of the Jacobian for
#'    df calculation.
#'
#' @author David Gerard
#'
#' @noRd
gam_to_df <- function(gam, fix_list = NULL, db = c("ces", "prcs"), ob = 0.03, df_tol = 1e-3) {
  db <- match.arg(db)
  TOL <- pkg_env$TOL_big
  if (is.null(fix_list)) {
    fix_list <- list(
      list(
        alpha = FALSE,
        beta = FALSE,
        gamma = FALSE
      ),
      list(
        alpha = FALSE,
        beta = FALSE,
        gamma = FALSE
      ),
      list(
        pi = FALSE
      )
    )
  }

  ret <- gam_to_par(gam = gam, fix_list = fix_list, db = db, ob = ob, gamma_type = "simplex")
  onbound <- (ret$par < ret$lower + TOL) | (ret$par > ret$upper - TOL)
  if (sum(!onbound) == 0) {
    return(0)
  }
  par_inner <- ret$par[!onbound]
  par_edge <-ret$par[onbound]
  fn <- function(par_inner, par_edge, onbound, rule) {
    par <- rep(NA_real_, times = length(onbound))
    par[!onbound] <- par_inner
    par[onbound] <- par_edge
    par_to_gf(par = par, rule = rule, gamma_type = "simplex")
  }

  env <- new.env()
  env$par_inner <- par_inner
  env$par_edge <- par_edge
  env$onbound <- onbound
  env$rule <- ret$rule
  dout <- stats::numericDeriv(expr = quote(fn(par_inner = par_inner, par_edge = par_edge, onbound = onbound, rule = rule)), theta = "par_inner", rho = env)
  dvec <- svd(attr(dout, "gradient"))$d
  ## heuristic: Largest SV times small value (e.g. 1/1000) for rank. Works well in practice.
  nparam <- sum(dvec > dvec[[1]] * df_tol)
  return(nparam)
}

#' Log-likelihood when we add the outlier proportion.
#'
#' @param par The outlier proportion
#' @param q The genotype frequencies without the outliers
#' @param x The data (either a vector of counts or a matrix of log-likelihoods)
#'
#' @author David Gerard
#'
#' @noRd
ll_ao <- function(par, q, x) {
  gf <- (1 - par) * q + par / length(q)
  if (is.matrix(x)) {
    stopifnot(ncol(x) == length(q))
    ll <- llike_li(B = x, lpivec = log(gf))
  } else {
    stopifnot(length(x) == length(q))
    ll <- stats::dmultinom(x = x, prob = gf, log = TRUE)
  }
  return(ll)
}

#' @title Parallelized likelihood ratio test for segregation distortion for
#' arbitrary (even) ploidies.
#'
#' @description
#' Uses the future package to implement parallelization support for
#' the likelihood ratio tests for segregation distortion. Details of
#' this test are provided in the \code{\link{seg_lrt}()} function's
#' documentation. See Gerard et al. (2025) for details of the methods.
#'
#' @inheritSection seg_lrt Null Model
#'
#' @inheritSection seg_lrt Unidentified Parameters
#'
#' @section Parallel Computation:
#'
#' The \code{seg_multi()} function supports parallel computing. It does
#' so through the \href{https://cran.r-project.org/package=future}{future}
#' package.
#'
#' You first specify the evaluation plan with \code{\link[future]{plan}()}
#' from the \code{future} package. On a local machine, this is typically
#' just \code{future::plan(future::multisession, workers = nc)} where
#' \code{nc} is the number of workers you want. You can find the maximum
#' number of possible workers with \code{\link[future]{availableCores}()}.
#' You then run \code{seg_multi()}, then shut down the workers with
#' \code{future::plan(future::sequential)}. The pseudo code is
#' \preformatted{
#'   future::plan(future::multisession, workers = nc)
#'   seg_multi()
#'   future::plan(future::sequential)
#' }
#'
#' @param g One of two inputs
#'   \itemize{
#'     \item{A matrix of genotype counts. The rows index the loci and the columns index the genotypes.}
#'     \item{An array of genotype log-likelihoods. The rows index the loci, the columns index the individuals, and the slices index the genotypes. Log-likelihoods are base e (natural log).}
#'   }
#' @param p1 One of three inputs
#'   \itemize{
#'     \item{A vector of parent 1's genotypes.}
#'     \item{A matrix of parent 1's genotype log-likelihoods. The rows index the loci and the columns index the genotypes. Logs are in base e (natural log).}
#'     \item{\code{NULL} (only supported when using genotype likelihoods for the offspring)}
#'   }
#' @param p2 One of three inputs
#'   \itemize{
#'     \item{A vector of parent 1's genotypes.}
#'     \item{A matrix of parent 1's genotype log-likelihoods. The rows index the loci and the columns index the genotypes. Logs are in base e (natural log).}
#'     \item{\code{NULL} (only supported when using genotype likelihoods for the offspring)}
#'   }
#' @inheritParams seg_lrt
#'
#' @author David Gerard
#'
#' @return A data frame with the following elements:
#' \describe{
#'   \item{\code{statistic}}{The likelihood ratio test statistic}
#'   \item{\code{p_value}}{The p-value of the likelihood ratio test.}
#'   \item{\code{df}}{The (estimated) degrees of freedom of the test.}
#'   \item{\code{null_bic}}{The BIC of the null model (no segregation distortion).}
#'   \item{\code{df0}}{The (estimated) number of parameters under null.}
#'   \item{\code{df1}}{The (estimated) number of parameters under the alternative.}
#'   \item{\code{p1}}{The (estimated) genotype of parent 1.}
#'   \item{\code{p2}}{The (estimated) genotype of parent 2.}
#'   \item{\code{q0}}{The MLE of the genotype frequencies under the null.}
#'   \item{\code{q1}}{The MLE of the genotype frequencies under the alternative.}
#'   \item{\code{outprob}}{Outlier probabilities. Only returned in \code{ret_out = TRUE}.
#'     \itemize{
#'       \item{If using genotype counts, element \code{i} is the probability that an individual \emph{with genotype} \code{i-1} is an outlier. So the return vector has length ploidy plus 1.}
#'       \item{If using genotype log-likelihoods, element \code{i} is the probability that individual \code{i} is an outlier. So the return vector has the same length as the number of individuals.}
#'       }
#'     These outlier probabilities are only valid if the null of no segregation is true.
#'    }
#' }
#' Note that since this data frame contains the list-columns \code{q0} and
#' \code{q1}, you cannot use \code{\link[utils]{write.csv}()} to save it.
#' You have to either remove those columns first or use something
#' like \code{\link[base]{saveRDS}()}
#'
#' @seealso
#' - [seg_lrt()] Single locus LRT for segregation distortion.
#' - [gamfreq()] Gamete frequencies under various models of meiosis
#' - [gf_freq()] F1 genotype frequencies under various models of meiosis.
#' - [multidog_to_g()] Converts the output of `updog::multidog()` into something that you can input into `seg_multi()`.
#'
#' @examples
#' \donttest{
#' ## Assuming genotypes are known (typically a bad idea)
#' glist <- multidog_to_g(
#'   mout = ufit,
#'   ploidy = 4,
#'   type = "all_g",
#'   p1 = "indigocrisp",
#'   p2 = "sweetcrisp")
#' p1_1 <- glist$p1
#' p2_1 <- glist$p2
#' g_1 <- glist$g
#' s1 <- seg_multi(
#'   g = g_1,
#'   p1_ploidy = 4,
#'   p2_ploidy = 4,
#'   p1 = p1_1,
#'   p2 = p2_1)
#' s1[, c("snp", "p_value")]
#'
#' ## Put NULL if you have absolutely no information on the parents
#' s2 <- seg_multi(g = g_1, p1_ploidy = 4, p2_ploidy = 4, p1 = NULL, p2 = NULL)
#' s2[, c("snp", "p_value")]
#'
#' ## Using genotype likelihoods (typically a good idea)
#' ## Also demonstrate parallelization through future package.
#' glist <- multidog_to_g(
#'   mout = ufit,
#'   ploidy = 4,
#'   type = "all_gl",
#'   p1 = "indigocrisp",
#'   p2 = "sweetcrisp")
#' p1_2 <- glist$p1
#' p2_2 <- glist$p2
#' g_2 <- glist$g
#'
#' # future::plan(future::multisession, workers = 2)
#' # s3 <- seg_multi(
#' #   g = g_2,
#' #   p1_ploidy = 4,
#' #   p2_ploidy = 4,
#' #   p1 = p1_2,
#' #   p2 = p2_2,
#' #   ret_out = TRUE)
#' # future::plan(future::sequential)
#' # s3[, c("snp", "p_value")]
#'
#' ## Outlier probabilities are returned if `ret_out = TRUE`
#' # graphics::plot(s3$outprob[[6]], ylim = c(0, 1))
#' }
#'
#' @references
#' \itemize{
#'   \item{Gerard, D, Ambrosano, GB, Pereira, GdS, & Garcia, AAF (2025). Tests for segregation distortion in higher ploidy F1 populations. \emph{bioRxiv}, p. 1-20. \doi{10.1101/2025.06.23.661114}}
#' }
#'
#' @export
seg_multi <- function(
    g,
    p1_ploidy,
    p2_ploidy = p1_ploidy,
    p1 = NULL,
    p2 = NULL,
    model = c("seg", "auto", "auto_dr", "allo", "allo_pp", "auto_allo"),
    outlier = TRUE,
    ret_out = FALSE,
    ob = 0.03,
    db = c("ces", "prcs"),
    ntry = 3,
    df_tol = 1e-3) {

  model <- match.arg(model)
  db <- match.arg(db)

  if (is.null(p1)) {
    p1 <- rep(NA_real_, length.out = dim(g)[[1]])
    names(p1) <- dimnames(g)[[1]]
  }
  if (is.null(p2)) {
    p2 <- rep(NA_real_, length.out = dim(g)[[1]])
    names(p2) <- dimnames(g)[[1]]
  }

  ## Check input --------------------------------------------------------------
  if (length(dim(g)) == 3) {
    if (inherits(p1, "matrix") && inherits(p2, "matrix")) {
      type <- "glp"
      stopifnot(dimnames(g)[[1]] == rownames(p1))
      stopifnot(dimnames(g)[[1]] == rownames(p2))
    } else {
      type <- "glo"
      stopifnot(dimnames(g)[[1]] == names(p1))
      stopifnot(dimnames(g)[[1]] == names(p2))
    }
    nloc <- dim(g)[[1]]
    nind <- dim(g)[[2]]
  } else if (length(dim(g)) == 2) {
    type <- "g"
    nloc <- nrow(g)
  }

  ## Register doFuture  -------------------------------------------------------
  oldDoPar <- doFuture::registerDoFuture()
  on.exit(with(oldDoPar, foreach::setDoPar(fun=fun, data=data, info=info)), add = TRUE)

  i <- NULL
  ret <- foreach::foreach(
    i = seq_len(nloc),
    .combine = rbind,
    .export = c("g", "p1", "p2", "ret_out", "seg_lrt")
    ) %dorng% {
      if (type == "glp") {
        g_now <- g[i, , ]
        p1_now <- p1[i, ]
        p2_now <- p2[i, ]
      } else if (type == "glo") {
        g_now <- g[i, , ]
        p1_now <- if(is.na(p1[[i]])) NULL else p1[[i]]
        p2_now <- if(is.na(p2[[i]])) NULL else p2[[i]]
      } else if (type == "g") {
        g_now <- g[i, ]
        p1_now <- if(is.na(p1[[i]])) NULL else p1[[i]]
        p2_now <- if(is.na(p2[[i]])) NULL else p2[[i]]
      }
        lout <- seg_lrt(
          x = g_now,
          p1_ploidy = p1_ploidy,
          p2_ploidy = p2_ploidy,
          p1 = p1_now,
          p2 = p2_now,
          model = model,
          outlier = outlier,
          ob = ob,
          db = db,
          ntry = ntry,
          df_tol = df_tol,
          ret_out = ret_out)
        row_now <- as.data.frame(
          c(
            lout[c("stat", "df", "p_value", "null_bic")],
            df0 = lout$null$df0,
            df1 = lout$alt$df1,
            p1 = lout$null$gam[[1]]$g,
            p2 = lout$null$gam[[2]]$g
          )
        )
        row_now$q0 <- vector(mode = "list", length = 1)
        row_now$q0[[1]] <- lout$null[["q0"]]
        row_now$q1 <- vector(mode = "list", length = 1)
        row_now$q1[[1]] <- lout$alt[["q1"]]
        if (ret_out) {
          row_now$outprob <- vector(mode = "list", length = 1)
          row_now$outprob[[1]] <- lout$outprob
        }
        row_now
    }
  ret$snp <- dimnames(g)[[1]]
  rownames(ret) <- NULL

  attr(ret, "rng") <- NULL
  attr(ret, "doRNG_version") <- NULL

  return(ret)
}

#' Calculate outlier probability based on the fit from \code{\link{seg_lrt}()}
#'
#' @param g One of two inputs
#'   \itemize{
#'     \item{A matrix of genotype counts. The rows index the loci and the columns index the genotypes.}
#'     \item{An array of genotype log-likelihoods. The rows index the loci, the columns index the individuals, and the slices index the genotypes. Log-likelihoods are base e (natural log).}
#'   }
#' @param gam The \code{null$gam} output from \code{\link{seg_lrt}()}.
#'
#' @return A numeric vector.
#' \itemize{
#'   \item{If using genotype counts, element \code{i} is the probability that an individual \emph{with genotype} \code{i-1} is an outlier. So the return vector has length ploidy plus 1.}
#'   \item{If using genotype log-likelihoods, element \code{i} is the probability that individual \code{i} is an outlier. So the return vector has the same length as the number of individuals.}
#' }
#'
#' @author David Gerard
#'
#' @noRd
#'
#' @examples
#' set.seed(1)
#' g <- c(5, 5, 0, 0, 1)
#' gl <- simgl(g)
#' sout1 <- seg_lrt(x = g, p1_ploidy = 4, p2_ploidy = 4, p1 = 1, p2 = 0)
#' sout2 <- seg_lrt(x = gl, p1_ploidy = 4, p2_ploidy = 4, p1 = 1, p2 = 0)
#' outprob(g = g, gam = sout1$null$gam)
#' outprob(g = gl, gam = sout2$null$gam)
#'
outprob <- function(g, gam) {
  if (is.matrix(g)) {
    data_type <- "glike"
  } else {
    data_type <- "gcount"
  }

  if (!gam[[3]]$outlier) {
    if (data_type == "glike") {
      ret <- rep(x = 0, times = nrow(g))
      names(ret) <- rownames(g)
      return(ret)
    } else if (data_type == "gcount") {
      ret <- rep(x = 0, times = length(g))
      names(ret) <- 0:(length(g) - 1)
      return(ret)
    }
  }

  ## Get null model
  pi <- gam[[3]]$pi
  p1 <- do.call(what = gamfreq, args = gam[[1]])
  p2 <- do.call(what = gamfreq, args = gam[[2]])
  q <- convolve_2(p1 = p1, p2 = p2, nudge =0)

  if (data_type == "glike") {
    f1_1mpi <- apply(X = g + log(q)[col(g)], MARGIN = 1, FUN = log_sum_exp) + log(1 - pi)
    f0_pi <- apply(X = g + log(1 / length(q)), MARGIN = 1, FUN = log_sum_exp) + log(pi)
    ret <- exp(f0_pi - mapply(FUN = log_sum_exp_2, x = f0_pi, y = f1_1mpi))
    names(ret) <- names(g)
  } else if (data_type == "gcount") {
    ret <- (pi / length(q)) / (pi / length(q) + (1 - pi) * q)
    names(ret) <- 0:(length(q) - 1)
  } else {
    stop("outprob: blah")
  }

  return(ret)
}

Try the segtest package in your browser

Any scripts or data that you put into this service are public.

segtest documentation built on July 1, 2025, 1:07 a.m.