R/ustat.R
In hwep: Hardy-Weinberg Equilibrium in Polyploids

Documented in hweustat

##################################
## U-statistic approach to HWE and DR
##################################

#' Objective used in \code{\link{hweustat}()}
#'
#' @param nvec A vector of length ploidy + 1. \code{nvec[i]} is the observed
#'     number of individuals with dosage \code{i-1}.
#' @param alpha The double reduction parameter. Should be of length
#'     \code{floor(ploidy/4)}. These should sum to less than 1, since
#'     \code{1-sum(alpha)} is the assumed probability of no double
#'     reduction.
#' @param which_keep A logical vector of length \code{ploidy + 1}. All of the
#'     \code{FALSE}'s will be combined with each other. They form the last
#'     group.
#' @param omega A \code{sum(which_keep)+1} by \code{sum(which_keep)+1}
#'     positive semidefinite matrix.
#'     This is an estimate of the \emph{generalized inverse} covariance
#'     of the elements of the U-statistic. The last row and column
#'     correspond to all aggregated groups determined by \code{which_keep}.
#'
#' @author David Gerard
#'
#' @noRd
uobj <- function(nvec, alpha, omega = NULL, which_keep = NULL) {
  ## check parameters ----
  ploidy <- length(nvec) - 1
  stopifnot(ploidy %% 2 == 0, ploidy > 0)
  stopifnot(length(alpha) == floor(ploidy / 4))
  stopifnot(nvec >= 0)
  if (is.null(which_keep)) {
    which_keep <- rep(TRUE, ploidy + 1)
  }
  stopifnot(is.logical(which_keep), length(which_keep) == ploidy + 1)
  numkeep <- sum(which_keep)
  if (!is.null(omega)) {
    if (numkeep == ploidy + 1) {
      stopifnot(dim(omega) == c(ploidy + 1, ploidy + 1))
    } else {
      stopifnot(dim(omega) == c(numkeep + 1, numkeep + 1))
    }
  }

  ## Calculate objective ----
  n <- sum(nvec)
  qhat <- nvec / n
  fq <- freqnext(freq = qhat, alpha = alpha, check = FALSE)

  if (numkeep != ploidy + 1) {
    qhat <- c(qhat[which_keep], sum(qhat[!which_keep]))
    fq   <- c(fq[which_keep], sum(fq[!which_keep]))
  }

  if (is.null(omega)) {
    diff <- (qhat - fq)
    return(sum(diff^2) * n)
  } else {
    diff <- matrix((qhat - fq), ncol = 1)
    return(c(t(diff) %*% omega %*% diff) * n)
  }
}

#' Generalized inverse of a symmetric p.d. matrix
#'
#' This is the Moore-Penrose inverse of a \emph{symmetric positive semi-definite}
#' matrix.
#'
#' @param omega A matrix to invert. Must be symmetric and positive semi-definite.
#' @param tol The tolerance on the eigenvalues to discard.
#'
#' @author David Gerard
#'
#' @noRd
ginv <- function(omega, tol = sqrt(.Machine$double.eps)) {
  stopifnot(is.matrix(omega))
  stopifnot(nrow(omega) == ncol(omega))

  eout <- eigen(omega, symmetric = TRUE)

  stopifnot(eout$values > -tol)

  which_pos <- eout$values > tol

  f <- c(1 / eout$values[which_pos], rep(0, length.out = sum(!which_pos)))

  return(list(mat = eout$vectors %*% diag(f) %*% t(eout$vectors),
              rank = sum(which_pos)))
}


#' Same as \code{\link{ginv}()}, but explicitly specify rank
#'
#' @param Q The asymptotic covariance matrix, or an estimate of such matrix.
#' @param df The degrees of freedom to use.
#'
#' @author David Gerard
#'
#' @noRd
projme <- function(Q, df = nrow(Q) - 1) {
  stopifnot(is.matrix(Q))
  K <- nrow(Q)
  stopifnot(nrow(Q) == ncol(Q))
  stopifnot(df >= 0, df <= K)

  eout <- eigen(Q, symmetric = TRUE)
  f <- c(1 / eout$values[1:df], rep(0, K - df))
  return(mat = eout$vectors %*% diag(f) %*% t(eout$vectors))
}

#' \emph{Inverse} covariance estimate of elements of ustat
#'
#' This is the theoretical one, based on U-statistic theory.
#'
#' @inheritParams uobj
#'
#' @return Covariance estimate.
#'
#' @author David Gerard
#'
#' @noRd
ucov <- function(nvec, alpha) {
  ## Check parameters ----
  ploidy <- length(nvec) - 1
  stopifnot(ploidy %% 2 == 0, ploidy > 0)
  stopifnot(length(alpha) == floor(ploidy / 4))
  stopifnot(nvec >= 0)

  ## Calculate covariance ----
  n <- sum(nvec)
  qhat <- nvec / n
  fq <- freqnext(freq = qhat, alpha = alpha, check = FALSE)
  A <- aperm(zsegarray(alpha = alpha, ploidy = ploidy), c(3, 1, 2)) ## put offspring in first index

  Qmat <- diag(fq) - tcrossprod(fq)

  bread <- diag(ploidy + 1) -
    2 * tensr::mat(
      A = tensr::amprod(A = A, M = matrix(fq, nrow = 1), k = 2),
      k = 3
      )

  omega <- t(bread) %*% Qmat %*% bread

  return(omega)
}

#' U-process minimizer approach to equilibrium testing and double reduction
#' estimation
#'
#' Estimates double reduction and tests for equilibrium while accounting
#' for double reduction. It does this using an approach called
#' "U-process minimization", where we minimize a function of a U-statistic
#' that should be 0 at equilibrium given the true double reduction rate.
#'
#' This is a two-step estimator, where we first obtain a consistent
#' estimate of the double reduction parameter, use this to
#' estimate the covariance of estimators, then use this to obtain
#' our final estimate of the double reduction parameter.
#'
#' @param nvec A vector containing the observed genotype counts,
#'     where \code{nvec[[i]]} is the number of individuals with genotype
#'     \code{i-1}. This should be of length \code{ploidy+1}.
#' @param thresh The threshold for ignoring the genotype. We keep
#'     genotypes such that \code{nvec >= thresh}.
#'     Setting this to \code{0} uses all genotypes. Setting this to
#'     \code{NULL} uses a heuristic that works well in practice.
#' @param effdf A logical. Should we use the ad-hoc
#'     "effective degrees of freedom" (\code{TRUE}) or not (\code{FALSE})?
#'
#' @return A list with some or all of the following elements:
#' \describe{
#'   \item{\code{alpha}}{The estimated double reduction parameter(s).
#'       In diploids, this value is \code{NULL}.}
#'   \item{\code{chisq_hwe}}{The chi-square test statistic for testing
#'       against the null of equilibrium.}
#'   \item{\code{df_hwe}}{The degrees of freedom associated with
#'       \code{chisq_hwe}.}
#'   \item{\code{p_hwe}}{The p-value against the null of equilibrium.}
#' }
#'
#' @author David Gerard
#'
#' @examples
#' set.seed(1)
#' ploidy <- 6
#' size <- 1000
#' r <- 0.1
#' alpha <- 0.1
#' qvec <- hwefreq(r = r, alpha = alpha, ploidy = ploidy)
#' nvec <- c(rmultinom(n = 1, size = size, prob = qvec))
#' hweustat(nvec = nvec)
#'
#' @export
hweustat <- function(nvec,
                     thresh = NULL,
                     effdf = TRUE) {
  if (is.null(thresh)) { ## heuristic works well in practice
    posst <- sum(nvec) * 0.01
    if (posst <= 5) {
      thresh <- 5
    } else if (posst >= 10) {
      thresh <- 10
    } else {
      thresh <- round(posst)
    }
  }

  ploidy <- length(nvec) - 1
  stopifnot(ploidy %% 2 == 0, ploidy >= 4)
  ibdr <- floor(ploidy / 4)
  stopifnot(thresh >= 0, length(thresh) == 1)
  stopifnot(is.logical(effdf), length(effdf) == 1)
  minval <- sqrt(.Machine$double.eps)

  ## Choose which groups to aggregate ----
  which_keep <- choose_agg(x = nvec, thresh = thresh)

  ## Return early if too few groups ----
  if (sum(which_keep) - ibdr <= 0) {
    return(
      list(
        alpha = rep(NA_real_, ibdr),
        chisq_hwe = NA_real_,
        df_hwe = NA_real_,
        p_hwe = NA_real_
      )
    )
  }

  ## Create aggregation matrix ----
  numkeep <- sum(which_keep)
  H <- aggfun(which_keep = which_keep)
  if (numkeep >= ploidy) {
    ## aggregating one group = no aggregation at all.
    which_keep <- rep(TRUE, ploidy + 1)
    numkeep <- ploidy + 1
  }

  ## Upper bounds on alpha ----
  upper_alpha <- drbounds(ploidy = ploidy)

  ## optimization method ----
  ometh <- ifelse(ibdr == 1, "Brent", "L-BFGS-B")

  ## Run two-step procedure ---------------------------------------------------

  ## step 1 ----
  oout <- stats::optim(par = rep(minval, ibdr),
                       fn = uobj,
                       method = ometh,
                       lower = rep(minval, ibdr),
                       upper = upper_alpha,
                       nvec = nvec,
                       omega = NULL,
                       which_keep = which_keep)

  ## Calculate covariance ----
  alpha_tilde <- oout$par
  covmat <- ucov(nvec = nvec, alpha = alpha_tilde)
  projout <- ginv(H %*% covmat %*% t(H))
  omega <- projout$mat

  ## step 2 ----
  oout <- stats::optim(par = rep(minval, ibdr),
                       fn = uobj,
                       method = ometh,
                       lower = rep(minval, ibdr),
                       upper = upper_alpha,
                       nvec = nvec,
                       omega = omega,
                       which_keep = which_keep)

  ## Get alpha ----
  alpha <- oout$par

  ## Calculate chi-square statistic ---
  chisq_hwe <- oout$value

  ## Calculate degrees of freedom ----
  if (effdf) {
    dfadd <- sum((abs(alpha - minval) < 10 * minval) | (abs(alpha - upper_alpha) < 10 * minval))
  } else {
    dfadd <- 0
  }
  df_hwe <- projout$rank - ibdr + dfadd

  ## Run test ----
  p_hwe <- stats::pchisq(q = chisq_hwe, df = df_hwe, lower.tail = FALSE)

  ## return ----
  retlist <- list(
    alpha = alpha,
    chisq_hwe = chisq_hwe,
    df_hwe = df_hwe,
    p_hwe = p_hwe
  )

  retlist$p_hwe[retlist$df_hwe == 0] <- NA_real_

  return(retlist)
}