edgee: Edgeworth Expansions and Higher-Order Inference

Documented in smpStats

#' Sample statistics for Edgeworth expansions
#' 
#' Calculate sample statistics needed for Edgeworth expansions.
#' 
#' @param smp sample.
#' @param a vector of the same length as \code{smp} specifying categories of 
#'   observations (should contain two unique values). Treatment code is assumed
#'   to have a higher numeric value than control (relevant for \code{type =
#'   "Welch"}).
#' @inheritParams empEdge
#' @param unbiased.mom \code{logical} value indicating if unbiased estimators 
#'   for third through sixth central moments should be used.
#' @param moder \code{logical} value indicating if Edgeworth expansions for a
#'   moderated t-statistic will be used. If \code{TRUE}, prior information
#'   (\code{d0} and \code{s20}) and posterior variance should be provided.
#' @param d0 prior degrees of freedom (needed if \code{moder = TRUE}).
#' @param s20 prior value for variance (needed if \code{moder = TRUE}).
#' @param varpost posterior variance (needed if \code{moder = TRUE}).
#'   
#' @return A named vector of sample statistics to be used in Edgeworth
#'   exansions. The calculated statistics and corresponding names are: \itemize{
#'   \item for ordinary one-sample t-statistic: scaled cumulants named
#'   \code{"lam3", "lam4", "lam5", "lam6"}; \item for moderated one-sample
#'   t-statistic: central moment estimates named \code{"mu2", "mu3", "mu4",
#'   "mu5", "mu6"}, \code{A}, \code{B}, and prior degrees of freedom named
#'   \code{"d0"}; \item for ordinary two-sample t-statistic: central moment
#'   estimates repeated twice since the same distribution is assumed for two
#'   groups, named \code{"mu_x2", "mu_x3", "mu_x4", "mu_x5", "mu_x6"} and
#'   \code{"mu_y2", "mu_y3", "mu_y4", "mu_y5", "mu_y6"}, \code{A, B_x, B_y,
#'   b_x}, and \code{b_y}; \item  for moderated two-sample t-statistic:
#'   estimates of the same quantities as for ordinary t (with different
#'   estimators); additionally, prior degrees of freedom named \code{"d0"} is
#'   included; \item for Welch t-test: estimates of the same quantities as for
#'   ordinary t-statistic (with different estimators). In this case, central
#'   moment estimates for treatment and control groups are different.}
#'   
#' @examples 
#' # simulate sample - one-sample test
#' n <- 10
#' smp <- rlnorm(n, sdlog = 0.6)  
#' stats <- smpStats(smp)
#' stats
#' t <- sqrt(n)*mean(smp)/sd(smp)
#' tailDiag(stats, n)
#' Ft <- makeFx(stats, n, base = "t")
#' Ft(t)
#' 
#' # two-sample test
#' n2 <- 8
#' smp2 <- c(smp, rnorm(n2))
#' a <- rep(0:1, c(n, n2))
#' smpStats(smp2, a, unbiased.mom = FALSE)
#' 
#' # moderated t-statistic
#' if (requireNamespace("limma")) {
#'   # simulate high-dimensional data
#'   m  <- 1e4          # number of tests
#'   ns <- 0.05*m       # number of significant features
#'   dat <- matrix(rgamma(m*n, shape = 3) - 3, nrow = m)
#'   shifts <- runif(ns, 1, 5)
#'   dat[1:ns, ] <- dat[1:ns, ] - shifts
#'   # estimate prior information
#'   fit <- limma::lmFit(dat, rep(1, n))
#'   fbay <- limma::eBayes(fit)
#'   # look at one feature (row of data)
#'   i <- 625
#'   smpStats(dat[i, ], moder = TRUE, d0 = fbay$df.prior, s20 = fbay$s2.prior, 
#'            varpost = fbay$s2.post[i])
#' }
#'   
#' @seealso \code{\link{tailDiag}}, \code{\link{makeFx}}, and
#'   \code{\link{makeQx}} for functions that require \code{stats} argument
#'   corresponding to the output of \code{smpStats()}.
#' @export
smpStats <- function(smp, a = NULL, type = NULL, unbiased.mom = TRUE,
                     moder = FALSE, d0 = NULL, s20 = NULL, varpost = NULL) {
  n <- length(smp)
  if (is.null(a)) {
    type <- "one-sample"
  } else {
    if (length(a) != n) stop("design does not match data")
    if (length(unique(a)) == 1) {
      type <- "one-sample"
    } else if (length(unique(a)) == 2) {
      nx <- sum(a == max(a))
      ny <- n - nx
      n <- n/2
      if (is.null(type)) {
        type <- "two-sample"
      }
    } else {
      stop("more than two categories in the sample")
    }
  }
  
  if (type == "one-sample") {
    if (unbiased.mom) {
      mu <- getMomEdgeUnb(smp)
    } else {
      mu <- getMomEdgeBias(smp)
    }
    if (!moder) {
      return(getLam(mu))
    } else {
      if (is.null(d0))      stop("Please provide d0")
      if (is.null(s20))     stop("Please provide s20")
      if (is.null(varpost)) stop("Please provide posterior variance")
      mu[1] <- varpost
      A <- (d0*s20 + n*mu[1])/(d0 + n - 1)
      names(A) <- NULL  # remove carryover from mu
      B <- n/(d0 + n - 1)
      return(c(mu, A = A, B = B, d0 = d0))
    }
  }
  
  if (type == "two-sample") {
    bx <- n/nx
    by <- n/ny
    Cxy <- (nx + ny)/(nx + ny - 2)
    Bx <- Cxy*by
    By <- Cxy*bx
    if (unbiased.mom) {
      mu <- getMomEdgeUnb2(smp, a)
    } else {
      mu <- getMomEdgeBias2(smp, a)
    }
    if (!moder) {
      A <- Cxy*(bx + by)*mu[1]
      names(A) <- NULL  # remove carryover from mu
      stats <- rep(mu, 2)
      names(stats) <- c(paste("mu_x", 2:6, sep = ""), 
                        paste("mu_y", 2:6, sep = ""))
      return(c(stats, A = A, B_x = Bx, B_y = By, b_x = bx, b_y = by))
    } else {
      if (is.null(d0))      stop("Please provide d0")
      if (is.null(s20))     stop("Please provide s20")
      if (is.null(varpost)) stop("Please provide posterior variance")
      mu[1] <- varpost
      dg <- nx + ny - 2
      A <- (bx + by)*(d0*s20 + Cxy*dg*mu[1])/(d0 + dg)
      names(A) <- NULL  # remove carryover from mu
      Bx <- Bx*dg/(d0 + dg)
      By <- By*dg/(d0 + dg)
      stats <- rep(mu, 2)
      names(stats) <- c(paste("mu_x", 2:6, sep = ""), 
                        paste("mu_y", 2:6, sep = ""))
      return(c(stats, A = A, B_x = Bx, B_y = By, b_x = bx, b_y = by, d0 = d0))
    }
  }
  
  if (type %in% c("welch", "Welch")) {
    if (is.null(a) | length(unique(a)) != 2) stop("a does not match test type")
    bx <- n/nx
    by <- n/ny
    Cx <- nx/(nx - 1)
    Cy <- ny/(ny - 1)
    Bx <- Cx*bx
    By <- Cy*by
    treat <- a == max(a)
    smpx <- smp[treat]
    smpy <- smp[!treat]
    if (unbiased.mom) {
      mux <- getMomEdgeUnb(smpx)
      muy <- getMomEdgeUnb(smpy)
    } else {
      mux <- getMomEdgeBias(smpx)
      muy <- getMomEdgeBias(smpy)
    }
    A <- Cx*bx*mux[1] + Cy*by*muy[1]
    names(A) <- NULL  # remove carryover from mu
    stats <- c(mux, muy)
    names(stats) <- c(paste("mu_x", 2:6, sep = ""), 
                      paste("mu_y", 2:6, sep = ""))
    return(c(stats, A = A, B_x = Bx, B_y = By, b_x = bx, b_y = by))
  }
}