R/ashdog.R

Defines functions get_dimname get_q_array flex_update_pivec initialize_pivec is.flexdog plot.flexdog flexdog_full flexdog

Documented in flexdog flexdog_full get_q_array is.flexdog plot.flexdog

## Functions for flexdog

#' Flexible genotyping for polyploids from next-generation sequencing data.
#'
#' Genotype polyploid individuals from next generation
#' sequencing (NGS) data while assuming the genotype distribution is one of
#' several forms. \code{flexdog} does this while accounting for allele bias,
#' overdispersion, sequencing error. The method is described
#' in detail in Gerard et. al. (2018) and Gerard and Ferrão (2020). See
#' \code{\link{multidog}()} for running flexdog on multiple SNPs in parallel.
#'
#' @inherit flexdog_full
#'
#' @param bias_init A vector of initial values for the bias parameter
#'     over the multiple runs of \code{\link{flexdog_full}()}.
#' @param ... Additional parameters to pass to \code{\link{flexdog_full}()}.
#'
#' @seealso Run \code{browseVignettes(package = "updog")} in R for example usage.
#'     Other useful functions include:
#' \describe{
#'     \item{\code{\link{multidog}()}}{For running \code{flexdog()} on multiple
#'           SNPs in parallel.}
#'     \item{\code{\link{flexdog_full}()}}{For additional parameter options
#'           when running \code{flexdog()}.}
#'     \item{\code{\link{rgeno}()}}{For simulating genotypes under the allowable
#'           prior models in \code{flexdog()}.}
#'     \item{\code{\link{rflexdog}()}}{For simulating read-counts under the
#'           assumed likelihood model in \code{flexdog()}.}
#'     \item{\code{\link{plot.flexdog}()}}{For plotting the output of
#'           \code{flexdog()}.}
#'     \item{\code{\link{oracle_mis}()}}{For calculating the oracle genotyping
#'           error rates. This is useful for read-depth calculations
#'           \emph{before} collecting data. After you have data, using
#'           the value of \code{prop_mis} is better.}
#'     \item{\code{\link{oracle_cor}()}}{For calculating the correlation
#'           between the true genotypes and an oracle estimator
#'           (useful for read-depth calculations \emph{before}
#'           collecting data).}
#' }
#'
#' @author David Gerard
#'
#' @examples
#' \donttest{
#'
#' ## An S1 population where the first individual
#' ## is the parent.
#' data("snpdat")
#' ploidy  <- 6
#' refvec  <- snpdat$counts[snpdat$snp == "SNP2"]
#' sizevec <- snpdat$size[snpdat$snp == "SNP2"]
#' fout    <- flexdog(refvec   = refvec[-1],
#'                    sizevec  = sizevec[-1],
#'                    ploidy   = ploidy,
#'                    model    = "s1",
#'                    p1ref    = refvec[1],
#'                    p1size   = sizevec[1])
#' plot(fout)
#'
#' }
#'
#' ## A natural population. We will assume a
#' ## normal prior since there are so few
#' ## individuals.
#' data("uitdewilligen")
#' ploidy  <- 4
#' refvec  <- uitdewilligen$refmat[, 1]
#' sizevec <- uitdewilligen$sizemat[, 1]
#' fout    <- flexdog(refvec  = refvec,
#'                    sizevec = sizevec,
#'                    ploidy  = ploidy,
#'                    model   = "norm")
#' plot(fout)
#'
#'
#'
#' @export
flexdog <- function(refvec,
                    sizevec,
                    ploidy,
                    model       = c("norm",
                                    "hw",
                                    "bb",
                                    "s1",
                                    "s1pp",
                                    "f1",
                                    "f1pp",
                                    "flex",
                                    "uniform",
                                    "custom"),
                    p1ref       = NULL,
                    p1size      = NULL,
                    p2ref       = NULL,
                    p2size      = NULL,
                    snpname     = NULL,
                    bias_init   = exp(c(-1, -0.5, 0, 0.5, 1)),
                    verbose     = TRUE,
                    prior_vec   = NULL,
                    ...) {
  assertthat::assert_that(all(bias_init > 0))
  model <- match.arg(model)

  if (verbose) {
    if ((length(refvec) < (10 * (ploidy + 1))) & (model == "flex")) {
      cat(paste0("Very few individuals for model = \"flex\"",
                 "\nYou might want to try model = \"norm\" instead.\n\n"))
    }
  }

  fout <- list()
  fout$llike <- -Inf
  for (bias_index in seq_along(bias_init)) {
    if (verbose) {
      cat("         Fit:", bias_index, "of", length(bias_init), "\n")
      cat("Initial Bias:", bias_init[bias_index], "\n")
    }

    fcurrent <- flexdog_full(refvec    = refvec,
                             sizevec   = sizevec,
                             ploidy    = ploidy,
                             model     = model,
                             p1ref     = p1ref,
                             p1size    = p1size,
                             p2ref     = p2ref,
                             p2size    = p2size,
                             snpname   = snpname,
                             bias      = bias_init[bias_index],
                             verbose   = FALSE,
                             prior_vec = prior_vec,
                             ...)

    if (verbose) {
      cat("Log-Likelihood:", fcurrent$llike, "\n")
    }

    if (fcurrent$llike > fout$llike) {
      fout <- fcurrent

      if (verbose) {
        cat("Keeping new fit.\n\n")
      }
    } else if (verbose) {
      cat("Keeping old fit.\n\n")
    }
  }

  if (verbose) {
    cat("Done!\n")
  }

  return(fout)
}

#' Flexible genotyping for polyploids from next-generation sequencing data.
#'
#' Genotype polyploid individuals from next generation
#' sequencing (NGS) data while assuming the genotype distribution is one of
#' several forms. \code{\link{flexdog_full}()} does this while accounting for allele bias,
#' overdispersion, and sequencing error. This function has more
#' options than \code{\link{flexdog}} and is only meant for expert users.
#' The method is described in detail in Gerard et. al. (2018) and
#' Gerard and Ferrão (2020).
#'
#' Possible values of the genotype distribution (values of \code{model}) are:
#' \describe{
#'   \item{\code{"norm"}}{A distribution whose genotype frequencies are proportional
#'       to the density value of a normal with some mean and some standard deviation.
#'       Unlike the \code{"bb"} and \code{"hw"} options, this will allow for
#'       distributions both more and less dispersed than a binomial.
#'       This seems to be the most robust to violations in modeling assumptions, and so is the
#'       default. This prior class was developed in Gerard and Ferrão (2020).}
#'   \item{\code{"hw"}}{A binomial distribution that results from assuming that
#'       the population is in Hardy-Weinberg equilibrium (HWE). This actually does
#'       pretty well even when there are minor to moderate deviations from HWE.
#'       Though it does not perform as well as the `"norm"` option when there
#'       are severe deviations from HWE.}
#'   \item{\code{"bb"}}{A beta-binomial distribution. This is an overdispersed
#'       version of \code{"hw"} and can be derived from a special case of the Balding-Nichols model.}
#'   \item{\code{"s1"}}{This prior assumes the individuals are
#'       all full-siblings resulting
#'       from one generation of selfing. I.e. there is only
#'       one parent. This model assumes
#'       a particular type of meiotic behavior: polysomic
#'       inheritance with
#'       bivalent, non-preferential pairing.}
#'   \item{\code{"f1"}}{This prior assumes the individuals are all
#'       full-siblings resulting
#'       from one generation of a bi-parental cross.
#'       This model assumes
#'       a particular type of meiotic behavior: polysomic
#'       inheritance with
#'       bivalent, non-preferential pairing.}
#'   \item{\code{"f1pp"}}{This prior allows for double reduction
#'       and preferential pairing in an F1 population of tretraploids.}
#'   \item{\code{"s1pp"}}{This prior allows for double reduction
#'       and preferential pairing in an S1 population of tretraploids.}
#'   \item{\code{"flex"}}{Generically any categorical distribution. Theoretically,
#'       this works well if you have a lot of individuals. In practice, it seems to
#'       be much less robust to violations in modeling assumptions.}
#'   \item{\code{"uniform"}}{A discrete uniform distribution. This should never
#'       be used in practice.}
#'   \item{\code{"custom"}}{A pre-specified prior distribution. You specify
#'       it using the \code{prior_vec} argument. You should almost never
#'       use this option in practice.}
#' }
#'
#' You might think a good default is \code{model = "uniform"} because it is
#' somehow an "uninformative prior." But it is very informative and tends to
#' work horribly in practice. The intuition is that it will estimate
#' the allele bias and sequencing error rates so that the estimated genotypes
#' are approximately uniform (since we are assuming that they are approximately
#' uniform). This will usually result in unintuitive genotyping since most
#' populations don't have a uniform genotype distribution.
#' I include it as an option only for completeness. Please don't use it.
#'
#' The value of \code{prop_mis} is a very intuitive measure for
#' the quality of the SNP. \code{prop_mis} is the posterior
#' proportion of individuals mis-genotyped. So if you want only SNPS
#' that accurately genotype, say, 95\% of the individuals, you could
#' discard all SNPs with a \code{prop_mis} over \code{0.05}.
#'
#' The value of \code{maxpostprob} is a very intuitive measure
#' for the quality of the genotype estimate of an individual.
#' This is the posterior probability of correctly genotyping
#' the individual when using \code{geno} (the posterior mode)
#' as the genotype estimate. So if you want to correctly genotype,
#' say, 95\% of individuals, you could discard all individuals
#' with a \code{maxpostprob} of under \code{0.95}. However, if you are
#' just going to impute missing genotypes later, you might consider
#' not discarding any individuals as \code{flexdog}'s genotype estimates will
#' probably be more accurate than other more naive approaches, such
#' as imputing using the grand mean.
#'
#' In most datasets I've examined, allelic bias is a major issue. However,
#' you may fit the model assuming no allelic bias by setting
#' \code{update_bias = FALSE} and \code{bias_init = 1}.
#'
#' Prior to using \code{flexdog}, during the read-mapping step,
#' you could try to get rid of allelic bias by
#' using WASP (\doi{10.1101/011221}). If you are successful
#' in removing the allelic bias (because its only source was the read-mapping
#' step), then setting \code{update_bias = FALSE} and \code{bias_init = 1}
#' would be reasonable. You can visually inspect SNPs for bias by
#' using \code{\link{plot_geno}()}.
#'
#' \code{flexdog()}, like most methods, is invariant to which allele you
#' label as the "reference" and which you label as the "alternative".
#' That is, if you set \code{refvec} with the number of alternative
#' read-counts, then the resulting genotype estimates
#' will be the estimated allele dosage of the alternative allele.
#'
#' @param refvec A vector of counts of reads of the reference allele.
#' @param sizevec A vector of total counts.
#' @param ploidy The ploidy of the species. Assumed to be the same for each
#'     individual.
#' @param model What form should the prior (genotype distribution) take?
#'     See Details for possible values.
#' @param verbose Should we output more (\code{TRUE}) or less
#'     (\code{FALSE})?
#' @param mean_bias The prior mean of the log-bias.
#' @param var_bias The prior variance of the log-bias.
#' @param mean_seq The prior mean of the logit of the sequencing
#'     error rate.
#' @param var_seq The prior variance of the logit of the sequencing
#'     error rate.
#' @param mean_od The prior mean of the logit of the overdispersion parameter.
#' @param var_od The prior variance of the logit of the overdispersion
#'     parameter.
#' @param seq The starting value of the sequencing error rate.
#' @param bias The starting value of the bias.
#' @param od The starting value of the overdispersion parameter.
#' @param itermax The total number of EM iterations to
#'     run.
#' @param tol The tolerance stopping criterion. The EM algorithm will stop
#'     if the difference in the log-likelihoods between two consecutive
#'     iterations is less than \code{tol}.
#' @param update_bias A logical. Should we update \code{bias}
#'     (\code{TRUE}), or not (\code{FALSE})?
#' @param update_seq A logical. Should we update \code{seq}
#'     (\code{TRUE}), or not (\code{FALSE})?
#' @param update_od A logical. Should we update \code{od}
#'     (\code{TRUE}), or not (\code{FALSE})?
#' @param fs1_alpha The value at which to fix
#'     the mixing proportion for the uniform component
#'      when \code{model = "f1"}, \code{model = "s1"},
#'      \code{model = "f1pp"}, or \code{model = "s1pp"}.
#'      I would recommend some small value such as \code{10^-3}.
#' @param p1ref The reference counts for the first parent if
#'     \code{model = "f1"} or \code{model = "f1pp"}, or for
#'     the only parent if \code{model = "s1"} or \code{model = "s1pp"}.
#' @param p1size The total counts for the first parent if
#'     \code{model = "f1"} or \code{model = "f1pp"},
#'     or for the only parent if \code{model = "s1"} or \code{model = "s1pp"}.
#' @param p2ref The reference counts for the second parent if
#'     \code{model = "f1"} or \code{model = "f1pp"}.
#' @param p2size The total counts for the second parent if
#'     \code{model = "f1"} or \code{model = "f1pp"}.
#' @param snpname A string. The name of the SNP under consideration.
#'     This is just returned in the \code{input} list for your reference.
#' @param prior_vec The pre-specified genotype distribution. Only used if
#'     \code{model = "custom"} and must otherwise be \code{NULL}. If specified,
#'     then it should be a vector of length \code{ploidy + 1} with
#'     non-negative elements that sum to 1.
#' @param seq_upper The upper bound on the possible sequencing error rate.
#'     Default is 0.05, but you should adjust this if you have prior knowledge
#'     on the sequencing error rate of the sequencing technology.
#'
#' @return An object of class \code{flexdog}, which consists
#'     of a list with some or all of the following elements:
#' \describe{
#'   \item{\code{bias}}{The estimated bias parameter.}
#'   \item{\code{seq}}{The estimated sequencing error rate.}
#'   \item{\code{od}}{The estimated overdispersion parameter.}
#'   \item{\code{num_iter}}{The number of EM iterations ran. You should
#'       be wary if this equals \code{itermax}.}
#'   \item{\code{llike}}{The maximum marginal log-likelihood.}
#'   \item{\code{postmat}}{A matrix of posterior probabilities of each
#'       genotype for each individual. The rows index the individuals
#'       and the columns index the allele dosage.}
#'   \item{\code{genologlike}}{A matrix of genotype \emph{log}-likelihoods of each
#'       genotype for each individual. The rows index the individuals
#'       and the columns index the allele dosage.}
#'   \item{\code{gene_dist}}{The estimated genotype distribution. The
#'       \code{i}th element is the proportion of individuals with
#'       genotype \code{i-1}.}
#'   \item{\code{par}}{A list of the final estimates of the parameters
#'       of the genotype distribution. The elements included in \code{par}
#'       depends on the value of \code{model}:
#'       \describe{
#'       \item{\code{model = "norm"}:}{
#'         \describe{
#'           \item{\code{mu}:}{The normal mean.}
#'           \item{\code{sigma}:}{The normal standard deviation (not variance).}
#'         }
#'       }
#'       \item{\code{model = "hw"}:}{
#'         \describe{
#'           \item{\code{alpha}:}{The major allele frequency.}
#'         }
#'       }
#'       \item{\code{model = "bb"}:}{
#'         \describe{
#'           \item{\code{alpha}:}{The major allele frequency.}
#'           \item{\code{tau}:}{The overdispersion parameter. See the
#'               description of \code{rho} in the Details of
#'               \code{\link{betabinom}()}.}
#'         }
#'       }
#'       \item{\code{model = "s1"}:}{
#'         \describe{
#'           \item{\code{pgeno}:}{The allele dosage of the parent.}
#'           \item{\code{alpha}:}{The mixture proportion of the discrete
#'               uniform (included and fixed at a small value mostly for
#'               numerical stability reasons). See the description
#'               of \code{fs1_alpha} in \code{\link{flexdog_full}()}.}
#'         }
#'       }
#'       \item{\code{model = "f1"}:}{
#'         \describe{
#'           \item{\code{p1geno}:}{The allele dosage of the first parent.}
#'           \item{\code{p2geno}:}{The allele dosage of the second parent.}
#'           \item{\code{alpha}:}{The mixture proportion of the discrete
#'               uniform (included and fixed at a small value mostly for
#'               numerical stability reasons). See the description
#'               of \code{fs1_alpha} in \code{\link{flexdog_full}()}.}
#'         }
#'       }
#'       \item{\code{model = "s1pp"}:}{
#'         \describe{
#'           \item{\code{ell1}:}{The estimated dosage of the parent.}
#'           \item{\code{tau1}:}{The estimated double reduction parameter
#'               of the parent. Available if \code{ell1} is \code{1}, \code{2},
#'               or \code{3}. Identified if \code{ell1} is \code{1} or \code{3}.}
#'           \item{\code{gamma1}:}{The estimated preferential pairing parameter.
#'               Available if \code{ell1} is \code{2}. However, it is not
#'               returned in an identified form.}
#'           \item{\code{alpha}:}{The mixture proportion of the discrete
#'               uniform (included and fixed at a small value mostly for
#'               numerical stability reasons). See the description
#'               of \code{fs1_alpha} in \code{\link{flexdog_full}()}.}
#'         }
#'       }
#'       \item{\code{model = "f1pp"}:}{
#'         \describe{
#'           \item{\code{ell1}:}{The estimated dosage of parent 1.}
#'           \item{\code{ell2}:}{The estimated dosage of parent 2.}
#'           \item{\code{tau1}:}{The estimated double reduction parameter
#'               of parent 1. Available if \code{ell1} is \code{1}, \code{2},
#'               or \code{3}. Identified if \code{ell1} is \code{1} or \code{3}.}
#'           \item{\code{tau2}:}{The estimated double reduction parameter
#'               of parent 2. Available if \code{ell2} is \code{1}, \code{2},
#'               or \code{3}. Identified if \code{ell2} is \code{1} or \code{3}.}
#'           \item{\code{gamma1}:}{The estimated preferential pairing parameter
#'               of parent 1. Available if \code{ell1} is \code{2}. However,
#'               it is not returned in an identified form.}
#'           \item{\code{gamma2}:}{The estimated preferential pairing parameter
#'               of parent 2. Available if \code{ell2} is \code{2}. However,
#'               it is not returned in an identified form.}
#'           \item{\code{alpha}:}{The mixture proportion of the discrete
#'               uniform (included and fixed at a small value mostly for
#'               numerical stability reasons). See the description
#'               of \code{fs1_alpha} in \code{\link{flexdog_full}()}.}
#'         }
#'       }
#'       \item{\code{model = "flex"}:}{\code{par} is an empty list.}
#'       \item{\code{model = "uniform"}:}{\code{par} is an empty list.}
#'       \item{\code{model = "custom"}:}{\code{par} is an empty list.}
#'       }}
#'   \item{\code{geno}}{The posterior mode genotype. These are your
#'       genotype estimates.}
#'   \item{\code{maxpostprob}}{The maximum posterior probability. This
#'       is equivalent to the posterior probability of correctly
#'       genotyping each individual.}
#'   \item{\code{postmean}}{The posterior mean genotype. In downstream
#'       association studies, you might want to consider using these
#'       estimates.}
#'   \item{\code{input$refvec}}{The value of \code{refvec} provided by
#'       the user.}
#'   \item{\code{input$sizevec}}{The value of \code{sizevec} provided
#'       by the user.}
#'   \item{\code{input$ploidy}}{The value of \code{ploidy} provided
#'       by the user.}
#'   \item{\code{input$model}}{The value of \code{model} provided by
#'       the user.}
#'   \item{\code{input$p1ref}}{The value of \code{p1ref} provided by the user.}
#'   \item{\code{input$p1size}}{The value of \code{p1size} provided by the user.}
#'   \item{\code{input$p2ref}}{The value of \code{p2ref} provided by the user.}
#'   \item{\code{input$p2size}}{The value of \code{p2size} provided by the user.}
#'   \item{\code{input$snpname}}{The value of \code{snpname} provided by the user.}
#'   \item{\code{prop_mis}}{The posterior proportion of individuals
#'       genotyped incorrectly.}
#' }
#'
#' @author David Gerard
#'
#' @references
#' \itemize{
#'   \item{Gerard, D., Ferrão, L. F. V., Garcia, A. A. F., & Stephens, M. (2018). Genotyping Polyploids from Messy Sequencing Data. \emph{Genetics}, 210(3), 789-807. \doi{10.1534/genetics.118.301468}.}
#'   \item{Gerard, David, and Luís Felipe Ventorim Ferrão. "Priors for genotyping polyploids." Bioinformatics 36, no. 6 (2020): 1795-1800. \doi{10.1093/bioinformatics/btz852}.}
#' }
#'
#' @seealso
#' Run \code{browseVignettes(package = "updog")} in R for example usage.
#'     Other useful functions include:
#' \describe{
#'     \item{\code{\link{multidog}}}{For running \code{flexdog} on multiple
#'           SNPs in parallel.}
#'     \item{\code{\link{flexdog}}}{For a more user-friendly version of
#'           \code{flexdog_full}.}
#'     \item{\code{\link{rgeno}}}{For simulating genotypes under the allowable
#'           prior models in \code{flexdog}.}
#'     \item{\code{\link{rflexdog}}}{For simulating read-counts under the
#'           assumed likelihood model in \code{flexdog}.}
#'     \item{\code{\link{plot.flexdog}}}{For plotting the output of
#'           \code{flexdog}.}
#'     \item{\code{\link{oracle_mis}}}{For calculating the oracle genotyping
#'           error rates. This is useful for read-depth calculations
#'           \emph{before} collecting data. After you have data, using
#'           the value of \code{prop_mis} is better.}
#'     \item{\code{\link{oracle_cor}}}{For calculating the correlation
#'           between the true genotypes and an oracle estimator
#'           (useful for read-depth calculations \emph{before}
#'           collecting data).}
#' }
#'
#' @examples
#' ## A natural population. We will assume a
#' ## normal prior since there are so few
#' ## individuals.
#' data("uitdewilligen")
#' ploidy  <- 4
#' refvec  <- uitdewilligen$refmat[, 1]
#' sizevec <- uitdewilligen$sizemat[, 1]
#' fout    <- flexdog_full(refvec  = refvec,
#'                         sizevec = sizevec,
#'                         ploidy  = ploidy,
#'                         model   = "norm")
#' plot(fout)
#'
#' @export
flexdog_full <- function(refvec,
                         sizevec,
                         ploidy,
                         model       = c("norm",
                                         "hw",
                                         "bb",
                                         "s1",
                                         "s1pp",
                                         "f1",
                                         "f1pp",
                                         "flex",
                                         "uniform",
                                         "custom"),
                         verbose     = TRUE,
                         mean_bias   = 0,
                         var_bias    = 0.7 ^ 2,
                         mean_seq    = -4.7,
                         var_seq     = 1,
                         mean_od     = -5.5,
                         var_od      = 0.5 ^ 2,
                         seq         = 0.005,
                         bias        = 1,
                         od          = 0.001,
                         update_bias = TRUE,
                         update_seq  = TRUE,
                         update_od   = TRUE,
                         itermax     = 200,
                         tol         = 10 ^ -4,
                         fs1_alpha   = 10 ^ -3,
                         p1ref       = NULL,
                         p1size      = NULL,
                         p2ref       = NULL,
                         p2size      = NULL,
                         snpname     = NULL,
                         prior_vec   = NULL,
                         seq_upper   = 0.05) {

  ## Check input -----------------------------------------------------
  model <- match.arg(model)
  if (model == "uniform") {
    warning(paste0("flexdog: Using model = 'uniform'",
                   "\nis almost always a bad idea.",
                   "\nTry model = 'hw' or model = 'norm'",
                   "\nif you have data from a population study."))
  }

  assertthat::assert_that(seq_upper > 0)
  assertthat::assert_that(seq_upper < 1)
  assertthat::are_equal(length(refvec), length(sizevec))
  assertthat::assert_that(all(sizevec >= refvec, na.rm = TRUE))
  assertthat::assert_that(all(refvec >= 0, na.rm = TRUE))
  assertthat::are_equal(1, length(verbose))
  assertthat::are_equal(1, length(mean_bias))
  assertthat::are_equal(1, length(var_bias))
  assertthat::are_equal(1, length(mean_seq))
  assertthat::are_equal(1, length(var_seq))
  assertthat::are_equal(1, length(seq))
  assertthat::are_equal(1, length(bias))
  assertthat::are_equal(1, length(od))
  assertthat::are_equal(1, length(mean_od))
  assertthat::are_equal(1, length(var_od))
  assertthat::are_equal(1, length(seq_upper))
  assertthat::assert_that(var_bias > 0)
  assertthat::assert_that(var_seq > 0)
  assertthat::assert_that(seq >= 0, seq <= 1)
  assertthat::assert_that(bias > 0)
  assertthat::assert_that(od >= 0, od <= 1)
  assertthat::assert_that(is.logical(verbose))
  assertthat::are_equal(ploidy %% 1, 0)
  assertthat::assert_that(ploidy > 0)
  assertthat::assert_that(tol > 0)
  assertthat::are_equal(itermax %% 1, 0)
  assertthat::assert_that(itermax > 0)
  assertthat::assert_that(is.logical(update_bias))
  assertthat::assert_that(is.logical(update_seq))
  assertthat::assert_that(is.logical(update_od))

  ## check fs1_alpha -----------------------------------------------
  assertthat::are_equal(length(fs1_alpha), 1)
  assertthat::assert_that(fs1_alpha <= 1, fs1_alpha >= 0)

  ## Check custom --------------------------------------------------
  if (!is.null(prior_vec) & model != "custom") {
    stop('Cannot specify `prior_vec` when model != "custom"')
  } else if (!is.null(prior_vec)) {
    stopifnot(prior_vec >= 0)
    stopifnot(prior_vec <= 1)
    stopifnot(abs(sum(prior_vec) - 1) < 10^-6)
    stopifnot(length(prior_vec) == ploidy + 1)
  } else if (model == "custom" & is.null(prior_vec)) {
    stop('must specify `prior_vec` when model = "custom"')
  }

  ## Check p1ref, p2ref, p1size, p2size ----------------------------
  if ((!is.null(p1ref) | !is.null(p1size)) & (model != "f1" & model != "s1" & model != "f1pp" & model != "s1pp")) {
    stop("flexdog: if model is not 'f1', 's1', 'f1pp', or 's1pp' then p1ref and p1size both need to be NULL.")
  }
  if ((!is.null(p2ref) | !is.null(p2size)) & (model != "f1" & model != "f1pp")) {
    stop("flexdog: if model is not 'f1' or 'f1pp' then p2ref and p2size both need to be NULL.")
  }
  if ((is.null(p1ref) & !is.null(p1size)) | (!is.null(p1ref) & is.null(p1size))) {
    stop("flexdog: p1ref and p1size either need to be both NULL or both non-NULL.")
  }
  if ((is.null(p2ref) & !is.null(p2size)) | (!is.null(p2ref) & is.null(p2size))) {
    stop("flexdog: p1ref and p1size either need to be both NULL or both non-NULL.")
  }
  if (!is.null(p1ref)) {
    stopifnot(p1ref >= 0, p1size >= p1ref)
  }
  if (!is.null(p2ref)) {
    stopifnot(p2ref >= 0, p2size >= p2ref)
  }
  if (!is.null(snpname)) {
    stopifnot(is.character(snpname))
    stopifnot(length(snpname) == 1)
  }

  ## Preferential pairing only supported for tetraploids right now ------------
  if ((model == "f1pp" | model == "s1pp") & ploidy != 4) {
    stop("Currently, `model = \"f1pp\"` and `model = \"s1pp\"` are only supported when ploidy = 4.")
  }

  ## Initialization for HW
  mode <- mean(refvec / sizevec, na.rm = TRUE)

  ## Deal with missingness in sizevec and refvec -----------------------
  not_na_vec  <- !(is.na(refvec) | is.na(sizevec))
  refvec      <- refvec[not_na_vec]
  sizevec     <- sizevec[not_na_vec]

  ## Some variables needed to run EM ---------------------------
  control <- list() ## will contain parameters used to update pivec
  boundary_tol <- 10 ^ -6 ## smallest values of seq, bias, od,
                          ## how close can od and seq get to 1.
  if (od < boundary_tol) {
    od <- boundary_tol
  }
  if (seq < boundary_tol) {
    seq <- boundary_tol
  }
  if (bias < boundary_tol) {
    bias <- boundary_tol
  }
  if (od > 1 - boundary_tol) {
    od <- 1 - boundary_tol
  }
  if (seq > 1 - boundary_tol) {
    seq <- 1 - boundary_tol
  }

  ## since re-write these
  bias_init <- bias
  seq_init  <- seq
  od_init   <- od

  ## Some control variables
  if (model == "f1" | model == "s1") {
    control$qarray    <- get_q_array(ploidy = ploidy)
    control$fs1_alpha <- fs1_alpha
  } else if (model == "f1pp" | model == "s1pp") {
    control$fs1_alpha <- fs1_alpha
  } else if (model == "bb") {
    control$alpha <- mode ## initialize allele frequency for bb
    control$tau   <- boundary_tol ## initialize od for bb
  } else if (model == "norm") {
    control$mu    <- mode * ploidy ## initialize mean of normal
    control$sigma <- sqrt(ploidy * mode * (1 - mode)) ## initialize sd of normal
  } else if (model == "custom") {
    control$prior_vec <- prior_vec
  }

  pivec <- initialize_pivec(ploidy = ploidy, mode = mode, model = model)
  if (model == "custom") { ## hack to get around passing more arguments to initialize_pivec
    pivec <- prior_vec
  }
  assertthat::are_equal(sum(pivec), 1)
  control$pivec <- pivec

  ## Run EM ----------------------------------------
  iter_index  <- 1
  err         <- tol + 1
  llike       <- -Inf
  while (err > tol & iter_index <= itermax) {
    llike_old <- llike

    ## E-step ----------------------
    wik_mat <- get_wik_mat(probk_vec = pivec,
                           refvec    = refvec,
                           sizevec   = sizevec,
                           ploidy    = ploidy,
                           seq       = seq,
                           bias      = bias,
                           od = od)

    ## Update seq, bias, and od ----
    oout <- stats::optim(par         = c(seq, bias, od),
                         fn          = obj_for_eps,
                         gr          = grad_for_eps,
                         method      = "L-BFGS-B",
                         lower       = rep(boundary_tol, 3),
                         upper       = c(seq_upper, Inf,
                                         1 - boundary_tol),
                         control     = list(fnscale = -1, maxit = 20),
                         refvec      = refvec,
                         sizevec     = sizevec,
                         ploidy      = ploidy,
                         mean_bias   = mean_bias,
                         var_bias    = var_bias,
                         mean_seq    = mean_seq,
                         var_seq     = var_seq,
                         mean_od     = mean_od,
                         var_od      = var_od,
                         wmat        = wik_mat,
                         update_seq  = update_seq,
                         update_bias = update_bias,
                         update_od   = update_od)
    seq  <- oout$par[1]
    bias <- oout$par[2]
    od   <- oout$par[3]

    ## if F1 or S1, update betabinomial log-likelihood of parent counts
    if (model == "f1" | model == "f1pp") {
      if (!is.null(p1ref)) {
        xi_vec <- xi_fun(p = 0:ploidy / ploidy, eps = seq, h = bias)
        control$p1_lbb <- dbetabinom(x    = rep(p1ref, ploidy + 1),
                                     size = rep(p1size, ploidy + 1),
                                     mu   = xi_vec,
                                     rho  = od,
                                     log  = TRUE)
      }
      if (!is.null(p2ref)) {
        xi_vec <- xi_fun(p = 0:ploidy / ploidy, eps = seq, h = bias)
        control$p2_lbb <- dbetabinom(x    = rep(p2ref, ploidy + 1),
                                     size = rep(p2size, ploidy + 1),
                                     mu   = xi_vec,
                                     rho  = od,
                                     log  = TRUE)
      }
    } else if (model == "s1" | model == "s1pp") {
      if (!is.null(p1ref)) {
        xi_vec <- xi_fun(p = 0:ploidy / ploidy, eps = seq, h = bias)
        control$p1_lbb <- dbetabinom(x    = rep(p1ref, ploidy + 1),
                                     size = rep(p1size, ploidy + 1),
                                     mu   = xi_vec,
                                     rho  = od,
                                     log  = TRUE)
      }
    } else {
      ## do nothing
    }

    ## Update pivec ----------------
    weight_vec <- colSums(wik_mat)
    fupdate_out <- flex_update_pivec(weight_vec = weight_vec,
                                     model      = model,
                                     control    = control)
    pivec <- fupdate_out$pivec
    control$pivec <- pivec

    ## New initialization parameters for priors that use gradient ascent.
    if (model == "bb") {
      control$alpha <- fupdate_out$par$alpha
      control$tau   <- fupdate_out$par$tau
    } else if (model == "norm") {
      control$mu    <- fupdate_out$par$mu
      control$sigma <- fupdate_out$par$sigma
    }

    ## Fix pivec from small numerical deviations -------------------------------
    pivec[pivec < 0] <- 0
    pivec[pivec > 1] <- 1
    pivec <- pivec / sum(pivec)

    ## Calculate likelihood and update stopping criteria --------------
    llike <- flexdog_obj(probk_vec = pivec,
                         refvec    = refvec,
                         sizevec   = sizevec,
                         ploidy    = ploidy,
                         seq       = seq,
                         bias      = bias,
                         od        = od,
                         mean_bias = mean_bias,
                         var_bias  = var_bias,
                         mean_seq  = mean_seq,
                         var_seq   = var_seq,
                         mean_od   = mean_od,
                         var_od    = var_od)

    err        <- abs(llike - llike_old)
    iter_index <- iter_index + 1

    if (llike < llike_old - 10 ^ -5) {
      warning(paste0("flexdog: likelihood not increasing.\nDifference is: ",
                     llike - llike_old))
      cat("Warn:\n")
      cat("Diff:", llike - llike_old, "\n")
      cat("p1geno:", fupdate_out$par$p1geno, "\n")
      cat("p2geno:", fupdate_out$par$p2geno, "\n")
      cat("llike:", format(llike, digits = 10), "\n")
      cat("llike_old:", format(llike_old, digits = 10), "\n\n")
      cat("pivec:", pivec, "\n")
      if (verbose) {
        cat("\nindex: ", iter_index, "\n")
        cat("llike: ", llike, "\n")
        cat("pivec: ", paste0("c(", paste0(pivec, collapse = ", "), ")"), "\n")
        cat("weight_vec: ",  paste0("c(", paste0(weight_vec, collapse = ", "), ")"), "\n\n")
      }
    }
  }

  ## End EM ------------------------------------------------------------------

  ## Initialize return list
  return_list <- list(bias      = bias,
                      seq       = seq,
                      od        = od,
                      num_iter  = iter_index,
                      llike     = llike,
                      postmat   = wik_mat,
                      gene_dist = pivec,
                      par       = fupdate_out$par)
  return_list$genologlike <- get_genotype_likelihoods(refvec = refvec,
                                                      sizevec = sizevec,
                                                      ploidy = ploidy,
                                                      seq = seq,
                                                      bias = bias,
                                                      od = od)

  ## Summaries --------------------------------------------------------
  return_list$geno          <- apply(return_list$postmat, 1, which.max) - 1
  return_list$maxpostprob   <- return_list$postmat[cbind(seq_len(nrow(return_list$postmat)), return_list$geno + 1)]
  return_list$postmean      <- c(return_list$postmat %*% 0:ploidy)
  return_list$input$refvec  <- refvec
  return_list$input$sizevec <- sizevec
  return_list$input$ploidy  <- ploidy
  return_list$input$model   <- model
  return_list$input$p1ref   <- p1ref
  return_list$input$p1size  <- p1size
  return_list$input$p2ref   <- p2ref
  return_list$input$p2size  <- p2size
  return_list$input$snpname <- snpname
  return_list$prop_mis      <- 1 - mean(return_list$maxpostprob)

  ## Add back missingness ---------------------------------------------
  temp                     <- rep(NA, length = length(not_na_vec))
  temp[not_na_vec]         <- refvec
  return_list$input$refvec <- temp

  temp                      <- rep(NA, length = length(not_na_vec))
  temp[not_na_vec]          <- sizevec
  return_list$input$sizevec <- temp

  temp             <- rep(NA, nrow = length(not_na_vec))
  temp[not_na_vec] <- return_list$geno
  return_list$geno <- temp

  temp                    <- rep(NA, nrow = length(not_na_vec))
  temp[not_na_vec]        <- return_list$maxpostprob
  return_list$maxpostprob <- temp

  temp                 <- rep(NA, nrow = length(not_na_vec))
  temp[not_na_vec]     <- return_list$postmean
  return_list$postmean <- temp

  temp                <- matrix(NA, nrow = length(not_na_vec),
                                ncol = ncol(return_list$postmat))
  temp[not_na_vec, ]  <- return_list$postmat
  return_list$postmat <- temp

  temp                <- matrix(NA, nrow = length(not_na_vec),
                                ncol = ncol(return_list$genologlike))
  temp[not_na_vec, ]  <- return_list$genologlike
  return_list$genologlike <- temp

  ## Set class to flexdog ---------------------------------------------
  class(return_list) <- "flexdog"

  return(return_list)
}

#' Draw a genotype plot from the output of \code{\link{flexdog}}.
#'
#' A wrapper for \code{\link{plot_geno}}. This will create a genotype plot for a single SNP.
#'
#' On a genotype plot, the x-axis contains the counts of the non-reference allele and the y-axis
#' contains the counts of the reference allele. The dashed lines are the expected counts (both reference and alternative)
#' given the sequencing error rate and the allele-bias. The plots are color-coded by the maximum-a-posterior genotypes.
#' Transparency is proportional to the maximum posterior probability for an
#' individual's genotype. Thus, we are less certain of the genotype of more transparent individuals. These
#' types of plots are used in Gerard et. al. (2018) and Gerard and Ferrão (2020).
#'
#' @param x A \code{flexdog} object.
#' @param use_colorblind Should we use a colorblind-safe palette
#'     (\code{TRUE}) or not (\code{FALSE})? \code{TRUE} is only allowed
#'     if the ploidy is less than or equal to 6.
#' @param ... Not used.
#'
#' @author David Gerard
#'
#' @seealso
#' \describe{
#'   \item{\code{\link{plot_geno}}}{The underlying plotting function.}
#'   \item{\code{\link{flexdog}}}{Creates a \code{flexdog} object.}
#' }
#'
#' @references
#' \itemize{
#'   \item{Gerard, D., Ferrão, L. F. V., Garcia, A. A. F., & Stephens, M. (2018). Genotyping Polyploids from Messy Sequencing Data. \emph{Genetics}, 210(3), 789-807. \doi{10.1534/genetics.118.301468}.}
#'   \item{Gerard, David, and Luís Felipe Ventorim Ferrão. "Priors for genotyping polyploids." Bioinformatics 36, no. 6 (2020): 1795-1800. \doi{10.1093/bioinformatics/btz852}.}
#' }
#'
#' @return A \code{\link[ggplot2]{ggplot}} object for the genotype plot.
#'
#' @export
plot.flexdog <- function(x, use_colorblind = TRUE, ...) {
  assertthat::assert_that(is.flexdog(x))
  if (x$input$model == "s1") {
    p1geno <- x$par$pgeno
    p2geno <- NULL
  } else if (x$input$model == "f1") {
    p1geno <- x$par$p1geno
    p2geno <- x$par$p2geno
  } else {
    p1geno <- NULL
    p2geno <- NULL
  }

  pl <- plot_geno(refvec         = x$input$refvec,
                  sizevec        = x$input$sizevec,
                  ploidy         = x$input$ploidy,
                  geno           = x$geno,
                  seq            = x$seq,
                  bias           = x$bias,
                  maxpostprob    = x$maxpostprob,
                  p1ref          = x$input$p1ref,
                  p1size         = x$input$p1size,
                  p2ref          = x$input$p2ref,
                  p2size         = x$input$p2size,
                  p1geno         = p1geno,
                  p2geno         = p2geno,
                  use_colorblind = use_colorblind)
  return(pl)
}

#' Tests if an argument is a \code{flexdog} object.
#'
#' @param x Anything.
#'
#' @return A logical. \code{TRUE} if \code{x} is a \code{flexdog} object, and \code{FALSE} otherwise.
#'
#' @author David Gerard
#'
#' @export
#'
#' @examples
#' is.flexdog("anything")
#' # FALSE
#'
is.flexdog <- function(x) {
  inherits(x, "flexdog")
}


#' Initialize \code{pivec} for \code{\link{flexdog}} EM algorithm.
#'
#' The key idea here is choosing the pi's so that the two modes
#' have equal probability.
#'
#' @inheritParams flexdog_full
#'
#' @seealso \code{\link{flexdog}} for where this is used.
#'
#' @return A vector of numerics. The initial value of \code{pivec}
#'     used in \code{\link{flexdog_full}}.
#'
#' @keywords internal
#' @noRd
#'
#' @author David Gerard
initialize_pivec <- function(ploidy,
                             mode,
                             model = c("hw",
                                       "bb",
                                       "norm",
                                       "f1",
                                       "f1pp",
                                       "s1",
                                       "s1pp",
                                       "flex",
                                       "uniform",
                                       "custom")) {
  assertthat::are_equal(1, length(ploidy))
  assertthat::are_equal(1, length(mode))
  assertthat::are_equal(ploidy %% 1, 0)

  model <- match.arg(model)
  if (model == "flex" | model == "uniform") {
    pivec <- rep(x = 1 / (ploidy + 1), length = ploidy + 1)
  } else if (model == "hw" | model == "f1" | model == "s1" | model == "f1pp" | model == "s1pp" | model == "bb" | model == "norm" |  model == "custom") {
    if (mode < 0 | mode > 1) {
      stop(paste0("initialize_pivec: initialization should be between 0 and 1.",
                  "\nIt is the initialization of the allele frequency."))
    }
    pivec <- stats::dbinom(x = 0:ploidy, size = ploidy, prob = mode)
  } else {
    stop("initialize_pivec: How did you get here?")
  }
  return(pivec)
}


#' Update the distribution of genotypes from various models.
#'
#' @param weight_vec \code{colSums(wik_mat)} from \code{\link{flexdog}}.
#'     This is the sum of current posterior probabilities of each individual
#'     having genotype k.
#' @param model What model are we assuming? See the description in \code{\link{flexdog}} for details.
#' @param control A list of anything else needed to be passed.
#'
#' @return A list with the following elements
#' \describe{
#'   \item{\code{pivec}}{The estimate of the genotype distribution.}
#'   \item{\code{par}}{A list of estimated parameters. An empty list if the model does not contain any parameters other than \code{pivec}.}
#' }
#'
#' @keywords internal
#' @noRd
#'
#' @author David Gerard
flex_update_pivec <- function(weight_vec,
                              model = c("hw",
                                        "bb",
                                        "norm",
                                        "f1",
                                        "f1pp",
                                        "s1",
                                        "s1pp",
                                        "flex",
                                        "uniform",
                                        "custom"),
                              control) {
  ## Check input -------------------------------
  ploidy <- length(weight_vec) - 1
  model <- match.arg(model)
  assertthat::assert_that(is.list(control))

  ## Get pivec ---------------------------------
  return_list <- list()
  if (model == "flex") {
    pivec <- weight_vec / sum(weight_vec)
    return_list$pivec <- pivec
    return_list$par <- list()
  } else if (model == "hw") {
    alpha <- sum(0:ploidy * weight_vec) / (ploidy * sum(weight_vec))
    pivec <- stats::dbinom(x = 0:ploidy, size = ploidy, prob = alpha)
    return_list$pivec     <- pivec
    return_list$par       <- list()
    return_list$par$alpha <- alpha
  } else if (model == "f1") {
    optim_best       <- list()
    optim_best$value <- -Inf
    for (i in 0:ploidy) { ## parent 1
      for (j in 0:ploidy) { ## parent 2
        pvec <- control$qarray[i + 1, j + 1, ]
        optim_out <- list() ## called this for historical reasons.
        optim_out$value <- f1_obj(alpha = control$fs1_alpha,
                                  pvec = pvec,
                                  weight_vec = weight_vec)
        optim_out$par   <- control$fs1_alpha
        if (!is.null(control$p1_lbb)) {
          optim_out$value <- optim_out$value + control$p1_lbb[i + 1]
        }
        if (!is.null(control$p2_lbb)) {
          optim_out$value <- optim_out$value + control$p2_lbb[j + 1]
        }
        if (optim_out$value > optim_best$value) {
          optim_best <- optim_out
          optim_best$ell1 <- i
          optim_best$ell2 <- j
        }
      }
    }
    return_list$pivec <- (1 - optim_best$par) * control$qarray[optim_best$ell1 + 1, optim_best$ell2 + 1, ] +
      optim_best$par / (ploidy + 1)
    return_list$par <- list()
    return_list$par$p1geno <- optim_best$ell1
    return_list$par$p2geno <- optim_best$ell2
    return_list$par$alpha  <- optim_best$par
  } else if (model == "s1") {
    optim_best       <- list()
    optim_best$value <- -Inf
    for (i in 0:ploidy) { ## parent
      pvec <- control$qarray[i + 1, i + 1, ]
      optim_out <- list() ## named this for historical reasons
      optim_out$value <- f1_obj(alpha = control$fs1_alpha,
                                pvec = pvec,
                                weight_vec = weight_vec)
      optim_out$par   <- control$fs1_alpha
      if (!is.null(control$p1_lbb)) {
        optim_out$value <- optim_out$value + control$p1_lbb[i + 1]
      }
      if (optim_out$value > optim_best$value) {
        optim_best <- optim_out
        optim_best$ell <- i
      }
    }
    return_list$pivec <- (1 - optim_best$par) * control$qarray[optim_best$ell + 1, optim_best$ell + 1, ] +
      optim_best$par / (ploidy + 1)
    return_list$par <- list()
    return_list$par$pgeno <- optim_best$ell
    return_list$par$alpha  <- optim_best$par
  } else if (model == "f1pp") {
    update_vec <- update_f1_s1_pp(weightvec = weight_vec,
                                  pop = "f1",
                                  fs1alpha = control$fs1_alpha,
                                  p1pen = control$p1_lbb,
                                  p2pen = control$p2_lbb)
    return_list$par <- list()
    return_list$par$ell1 <- update_vec[["ell1"]]
    return_list$par$ell2 <- update_vec[["ell2"]]
    return_list$par$tau1 <- update_vec[["tau1"]]
    return_list$par$tau2 <- update_vec[["tau2"]]
    return_list$par$gamma1 <- update_vec[["gamma1"]]
    return_list$par$gamma2 <- update_vec[["gamma2"]]
    return_list$par$alpha  <- control$fs1_alpha
    return_list$pivec <- prob_dosage_pp_unif(ell1     = update_vec[["ell1"]],
                                             ell2     = update_vec[["ell2"]],
                                             tau1     = update_vec[["tau1"]],
                                             tau2     = update_vec[["tau2"]],
                                             gamma1   = update_vec[["gamma1"]],
                                             gamma2   = update_vec[["gamma2"]],
                                             fs1alpha = control$fs1_alpha)
  } else if (model == "s1pp") {
    update_vec <- update_f1_s1_pp(weightvec = weight_vec,
                                  pop = "s1",
                                  fs1alpha = control$fs1_alpha,
                                  p1pen = control$p1_lbb)
    return_list$par <- list()
    return_list$par$ell1 <- update_vec[["ell1"]]
    return_list$par$tau1 <- update_vec[["tau1"]]
    return_list$par$gamma1 <- update_vec[["gamma1"]]
    return_list$par$alpha  <- control$fs1_alpha
    return_list$pivec <- prob_dosage_pp_unif(ell1     = update_vec[["ell1"]],
                                             ell2     = update_vec[["ell1"]],
                                             tau1     = update_vec[["tau1"]],
                                             tau2     = update_vec[["tau1"]],
                                             gamma1   = update_vec[["gamma1"]],
                                             gamma2   = update_vec[["gamma1"]],
                                             fs1alpha = control$fs1_alpha)
  } else if (model == "uniform") {
    return_list$pivec <- rep(x = 1 / (ploidy + 1), length = ploidy + 1)
    return_list$par <- list()
  } else if (model == "bb") {
    boundary_val <- 10 ^ -8
    optim_out <- stats::optim(par        = c(control$alpha, control$tau),
                              fn         = obj_for_weighted_lbb,
                              gr         = grad_for_weighted_lbb,
                              method     = "L-BFGS-B",
                              lower      = c(boundary_val, boundary_val),
                              upper      = c(1 - boundary_val, 1 - boundary_val),
                              weight_vec = weight_vec,
                              ploidy     = ploidy,
                              control = list(fnscale = -1))

    return_list$pivec <- dbetabinom(x    = 0:ploidy,
                                    size = ploidy,
                                    mu   = optim_out$par[1],
                                    rho  = optim_out$par[2],
                                    log  = FALSE)
    return_list$par <- list()
    return_list$par$alpha <- optim_out$par[1]
    return_list$par$tau   <- optim_out$par[2]
  } else if (model == "norm") {
    optim_out <- stats::optim(par        = c(control$mu, control$sigma),
                              fn         = obj_for_weighted_lnorm,
                              gr         = grad_for_weighted_lnorm,
                              method     = "L-BFGS-B",
                              lower      = c(-1, 10 ^ -8),
                              upper      = c(ploidy + 1, Inf),
                              weight_vec = weight_vec,
                              ploidy     = ploidy,
                              control    = list(fnscale = -1))
    return_list$pivec <- stats::dnorm(x = 0:ploidy,
                                      mean = optim_out$par[1],
                                      sd = optim_out$par[2],
                                      log = TRUE)
    return_list$pivec <- exp(return_list$pivec - log_sum_exp(return_list$pivec))
    return_list$par       <- list()
    return_list$par$mu    <- optim_out$par[1]
    return_list$par$sigma <- optim_out$par[2]
  } else if (model == "custom") {
    return_list$pivec <- control$prior_vec
    return_list$par <- list()
  } else {
    stop("flex_update_pivec: how did you get here?")
  }
  return(return_list)
}


#' Return the probabilities of an offspring's genotype given its
#' parental genotypes for all possible combinations of parental and
#' offspring genotypes. This is for species with polysomal inheritance
#' and bivalent, non-preferential pairing.
#'
#' @param ploidy A positive integer. The ploidy of the species.
#'
#' @author David Gerard
#'
#' @return An three-way array of proportions. The (i, j, k)th element
#'     is the probability of an offspring having k - 1 reference
#'     alleles given that parent 1 has i - 1 reference alleles and
#'     parent 2 has j - 1 reference alleles. Each dimension of the
#'     array is \code{ploidy + 1}. In the dimension names, "A" stands
#'     for the reference allele and "a" stands for the
#'     alternative allele.
#'
#' @examples
#' qarray <- get_q_array(6)
#' apply(qarray, c(1, 2), sum) ## should all be 1's.
#'
#' @export
#'
get_q_array <- function(ploidy) {
  assertthat::assert_that(ploidy > 0)
  assertthat::are_equal(ploidy %% 2, 0)

  qarray <- array(0, dim = rep(ploidy + 1, 3))

  for(oindex in 0:ploidy) {
    for (p1index in 0:ploidy) {
      for (p2index in 0:ploidy) {
        if (p1index + p2index < oindex) {
          qarray[p1index + 1, p2index + 1, oindex + 1] <- 0
        } else {
          minval <- max(0, oindex - p2index)
          maxval <- min(ploidy / 2, p1index)
          aseq <- minval:maxval

          p1prob <- stats::dhyper(x = aseq, m = p1index, n = ploidy - p1index, k = ploidy / 2)
          p2prob <- stats::dhyper(x = oindex - aseq, m = p2index, n = ploidy - p2index, k = ploidy / 2)
          qarray[p1index + 1, p2index + 1, oindex + 1] <- sum(p1prob * p2prob)
        }
      }
    }
  }

  ## get dimnames

  dimvec <- get_dimname(ploidy)
  dimnames(qarray) <- list(parent1 = dimvec, parent2 = dimvec, offspring = dimvec)


  assertthat::assert_that(all(abs(apply(qarray, c(1, 2), sum) - 1) < 10 ^ -14))

  return(qarray)
}


#' Returns a vector character strings that are all of the possible
#' combinations of the reference allele and the non-reference allele.
#'
#' @param ploidy The ploidy of the species.
#'
#' @return For example, if \code{ploidy = 3} then this will return
#'     c("aaa", "Aaa", "AAa", "AAA")
#'
#' @author David Gerard
#'
#' @keywords internal
#' @noRd
#'
#'
get_dimname <- function(ploidy) {
  dimvec <- vapply(mapply(FUN = c, lapply(X = 0:ploidy, FUN = rep.int, x = "A"),
                          lapply(X = ploidy:0, FUN = rep.int, x = "a"),
                          SIMPLIFY = FALSE),
                   FUN = paste, collapse = "", FUN.VALUE = "character")
  return(dimvec)
}

Try the updog package in your browser

Any scripts or data that you put into this service are public.

updog documentation built on Oct. 18, 2022, 9:07 a.m.