#' Empirical Edgeworth expansions for high-dimensional data analysis
#'
#' Higher order inference for one- and two-sample t-tests in high-dimensional
#' data. Includes ordinary and moderated t-statistic and Welch t-test.
#'
#' Unadjusted p-values are calculated for five orders of approximation for
#' ordinary and moderated (empirical Bayes method) t-statistics; prior
#' information and moderated t-statistics are calculated with \code{limma}
#' package. If prior degrees of freedom is \code{Inf}, higher orders are
#' provided for ordinary t-statistic only. In a two-sample test, when the
#' variances (and distributions) are not assumed to be equal and Welch t-test is
#' performed, only results for ordinary t-statistic are provided. Variance
#' adjustment is used for all the orders (see the paper) and therefore even
#' first-order results might differ slightly from the regular Student's
#' t-distribution approximation. When a first-order p-value (for moderated
#' t-statistic if relevant) is greater than provided significance level
#' \code{alpha}, no higher order inference is calculated.
#'
#' Tail diagnostic investigating Edgeworth expansion (EE) tail behavior is
#' performed for each relevant feature (row of data); if EE of a particular
#' order is not determined to be helpful, p-value of a previous order is
#' provided in its place.
#'
#' For better performance of a second order, using \code{unbiased.mom = TRUE} is
#' recommended (default). For variance estimate, posterior variance is used for
#' moderated t-statistic and unbiased/pooled variance for ordinary t.
#'
#' @param dat data matrix with rows corresponding to features. The number of
#' columns is a sample size and number of rows is a number of tests. If the
#' number of tests is \code{1}, \code{dat} can be a vector.
#' @param a treatment vector; the length has to correspond to the number of
#' columns in \code{dat}. Treatment code is assumed to have a higher numeric
#' value than control.
#' @param side the test can be one-sided or two-sided. For a one-sided test, the
#' values are \code{"left"} or \code{"right"}.
#' @param type type of the test with possible values \code{"one-sample"},
#' \code{"two-sample"}, and \code{"Welch"}. For regular one- and two-sample
#' tests the value is inferred from \code{a} but for Welch t-test it needs to
#' be specified.
#' @param unbiased.mom \code{logical} value indicating if unbiased estimators
#' for third through sixth central moments should be used.
#' @param alpha significance level.
#' @param ncheck number of intervals for tail diagnostic.
#' @param lim tail region for tail diagnostic. Provide the endpoints for the
#' right tail (positive values).
#'
#' @return A matrix with the same number of rows as \code{dat}, each row
#' providing p-values for five orders of Edgeworth expansions (0 - 4-term
#' expansions) for a corresponding feature (row of data). Where applicable,
#' p-values will be provided for both ordinary and moderated t-statistics (10
#' columns, five orders each); for Welch t-test the matrix will have five
#' columns, and if prior degrees of freedom is \code{Inf}, only first order
#' p-values are returned for moderated t-statistic (six columns); note that
#' variance adjustment \eqn{r^2} is 1 in that case.
#'
#' @seealso \code{\link{tailDiag}} for tail daignostic, \code{\link{makeFx}},
#' \code{\link{Ftshort}}, and \code{\link{Ftgen}} for calculating Edgeworth
#' expansions of orders 1 to 5, and \code{\link{smpStats}} for extracting
#' statistics needed to calculate EE from a sample.
#'
#' @examples
#' # simulate a data set
#' nx <- 10 # sample size
#' m <- 1e4 # number of tests
#' ns <- 0.05*m # number of significant features
#' dat <- matrix(rgamma(m*nx, shape = 3) - 3, nrow = m)
#' shifts <- runif(ns, 1, 5)
#' dat[1:ns, ] <- dat[1:ns, ] - shifts
#' # run
#' res <- empEdge(dat)
#' head(res, 3)
#'
#' # one test (data not high-dimensional)
#' empEdge(dat[1, ], side = "left", unbiased.mom = FALSE, alpha = 0.1)
#'
#' # Welch test
#' ny <- 12
#' dat2 <- cbind(matrix(rnorm(m*ny), nrow = m), dat)
#' treat <- rep(0:1, c(ny, nx))
#' res <- empEdge(dat2, treat, type = "Welch", ncheck = 50, lim = c(1, 10))
#' head(res, 3)
#'
#' # prior degrees of freedom not finite
#' if (require(limma)) {
#' d0 <- 0
#' while (is.finite(d0)) {
#' dat <- matrix(rnorm(m*nx), nrow = m)
#' dat[1:ns, ] <- dat[1:ns, ] + shifts
#' fit <- lmFit(dat, rep(1, nx))
#' d0 <- ebayes(fit)$df.prior
#' }
#' }
#' res <- empEdge(dat, side = "right")
#' head(res, 3)
#'
#' @export
#' @useDynLib edgee
#' @importFrom stats dnorm pnorm dt pt var model.matrix
empEdge <- function(dat, a = NULL, side = "two-sided", type = NULL,
unbiased.mom = TRUE, alpha = 0.05, ncheck = 30,
lim = c(1, 7)) {
if (is.null(dim(dat)) || any(dim(dat) == 1)) {
dat <- matrix(dat, nrow = 1)
}
n <- ncol(dat)
m <- nrow(dat)
if (is.null(a)) {
type <- "one-sample"
design.mat <- rep(1, n)
n <- c(n, 0)
} else {
if (length(a) != n) stop("design does not match data")
if (length(unique(a)) == 1) {
type <- "one-sample"
design.mat <- rep(1, n)
} else if (length(unique(a)) == 2) {
# assume treatment code has higher numeric value than control
treat <- a == max(a)
n <- c(sum(treat), sum(!treat))
if (m == 1) {
dat <- matrix(c(dat[, treat], dat[, !treat]), nrow = 1)
} else {
dat <- cbind(dat[, treat], dat[, !treat])
}
design.mat <- model.matrix(~ rep(1:0, n))
if (is.null(type)) {
type <- "two-sample"
}
} else {
stop("more than two categories in the sample")
}
}
if (type %in% c("welch", "Welch")) {
if (is.null(a) | length(unique(a)) != 2) stop("a does not match test type")
co <- .C("empEdgeWelch", dat = as.double(dat),
nc = as.integer(n), nr = as.integer(m), alpha = as.double(alpha),
side = as.character(side), ncheck = as.integer(ncheck),
lim = as.double(lim), unbmom = as.integer(unbiased.mom),
pval = as.double(rep(0, m*5)))
if (side == "two-sided") {
co$pval <- 2*co$pval
}
co$pval <- matrix(co$pval, nrow = m, ncol = 5)
colnames(co$pval) <- c("t-dist", paste("term", 1:4, sep = ""))
return(co$pval)
}
if (!requireNamespace("limma", quietly = TRUE)) {
warning("No results for moderated t-statistics are provided;
please install package 'limma' from Bioconductor")
co <- .C("empEdgeOrd0", dat = as.double(dat),
nc = as.integer(n), nr = as.integer(m), alpha = as.double(alpha),
onesmp = as.integer(type == "one-sample"),
side = as.character(side), ncheck = as.integer(ncheck),
lim = as.double(lim), unbmom = as.integer(unbiased.mom),
pval = as.double(rep(0, m*5)))
if (side == "two-sided") {
co$pval <- 2*co$pval
}
co$pval <- matrix(co$pval, nrow = m, ncol = 5)
colnames(co$pval) <- c("t-dist", paste("term", 1:4, sep = ""))
return(co$pval)
}
if (type %in% c("one-sample", "two-sample")) {
fit <- limma::lmFit(dat, design.mat, weights = NULL)
nf <- sum(1/n[n != 0]) # one-smp: 1/n, two-smp: 1/nx + 1/ny
t.ord <- fit$coefficients[, ncol(fit$coefficients)]/(sqrt(nf)*fit$sigma)
if (m == 1) {
co <- .C("empEdgeOrd", dat = as.double(dat),
nc = as.integer(n), nr = as.integer(m), alpha = as.double(alpha),
onesmp = as.integer(type == "one-sample"),
side = as.character(side), ncheck = as.integer(ncheck),
lim = as.double(lim), unbmom = as.integer(unbiased.mom),
t.ord = as.double(t.ord), pval = as.double(rep(0, m*5)))
if (side == "two-sided") {
co$pval <- 2*co$pval
}
names(co$pval) <- c("t-dist", paste("term", 1:4, sep = ""))
return(co$pval)
}
fbay <- limma::ebayes(fit)
s20 <- fbay$s2.prior
d0 <- fbay$df.prior
if (is.infinite(d0)) {
co <- .C("empEdgeOrd", dat = as.double(dat),
nc = as.integer(n), nr = as.integer(m), alpha = as.double(alpha),
onesmp = as.integer(type == "one-sample"),
side = as.character(side), ncheck = as.integer(ncheck),
lim = as.double(lim), unbmom = as.integer(unbiased.mom),
t.ord = as.double(t.ord), pval = as.double(rep(0, m*5)))
tM <- fbay$t[, ncol(fbay$t)]
pM <- fbay$p.value[, ncol(fbay$p.value)]
if (side == "two-sided") {
co$pval <- 2*co$pval
} else {
pM <- pM/2
pM <- mapply(function(t, p, side) {
if((t < 0 & side == "left") | (t > 0 & side == "right")) {
return(p)
} else {
return(1 - p)
}}, tM, pM, side)
}
co$pval <- matrix(co$pval, nrow = m, ncol = 5)
colnames(co$pval) <- c("t-dist", paste("term", 1:4, sep = ""))
return(cbind(co$pval, "tM-dist" = pM))
}
co <- .C("empEdge", dat = as.double(dat),
nc = as.integer(n), nr = as.integer(m),
d0 = as.double(d0), s20 = as.double(s20), alpha = as.double(alpha),
onesmp = as.integer(type == "one-sample"),
side = as.character(side), ncheck = as.integer(ncheck),
lim = as.double(lim), unbmom = as.integer(unbiased.mom),
varpost = as.double(fbay$s2.post), t.ord = as.double(t.ord),
t.mod = as.double(fbay$t[, ncol(fbay$t)]),
pval = as.double(rep(0, m*5)), pvalM = as.double(rep(0, m*5)))
if (side == "two-sided") {
co$pval <- 2*co$pval
co$pvalM <- 2*co$pvalM
}
co$pval <- matrix(co$pval, nrow = m, ncol = 5)
co$pvalM <- matrix(co$pvalM, nrow = m, ncol = 5)
colnames(co$pval) <- c("t-dist", paste("term", 1:4, sep = ""))
colnames(co$pvalM) <- c("tM-dist", paste("termM", 1:4, sep = ""))
return(cbind(co$pval, co$pvalM))
} else {
stop("type is not recognized")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.