#' Welch T-tests for rows of a matrix
#'
#' @param X A \code{m x n} numeric matrix whose rows correspond to variables
#' and columns to observations
#'
#' @param categ Either a numeric vector of \code{n} categories in \eqn{0, 1} for
#' the observations, or a \code{n x B} matrix stacking \code{B} such vectors
#' (typically permutations of an original vector of size \code{n})
#'
#' @param alternative A character string specifying the alternative hypothesis.
#' Must be one of "two.sided" (default), "greater" or "less". As in
#' \code{\link{t.test}}, alternative = "greater" is the alternative that class
#' 1 has a larger mean than class 0.
#'
#' @return A list containing the following components:
#' \describe{
#' \item{statistic}{the value of the t-statistics}
#' \item{parameter}{the degrees of freedom for the t-statistics}
#' \item{p.value}{the p-values for the tests}
#' \item{estimate}{the mean difference between groups}}
#' Each of these elements is a matrix of size \code{m x B}, coerced to a vector of length \code{m} if \code{B=1}
#'
#' @author Pierre Neuvial
#'
#' @details This function performs \code{m x B} Welch T tests on
#' \code{n} observations using matrix operations. Its time complexity is
#' \code{O(mBn)}. The code is much faster than using loops of 'apply'
#' functions, especially for high-dimensional problems (small n and large m)
#' because the overhead of the call to the 't.test' function is avoided and
#' the code is vectorized
#'
#' @references B. L. Welch (1951), On the comparison of several mean values: an
#' alternative approach. Biometrika, *38*, 330-336
#'
#' @export
#'
#' @examples
#'
#' m <- 300
#' n <- 38
#' mat <- matrix(rnorm(m * n), ncol = n)
#' categ <- rep(c(0, 1), times = c(27, n - 27))
#' system.time(fwt <- rowWelchTests(mat, categ, alternative = "greater"))
#' str(fwt)
#'
#' # compare with ordinary t.test:
#' system.time(pwt <- apply(mat, 1, FUN = function(x) {
#' t.test(x[categ == 1], x[categ == 0], alternative = "greater")$p.value
#' }))
#' all(abs(fwt$p.value - pwt) < 1e-10) ## same results
#'
#' # with several contrasts/permutations
#' B <- 100
#' categ_perm <- replicate(B, sample(categ))
#' system.time(fwt_perm <- rowWelchTests(mat, categ_perm, alternative = "greater"))
#' str(fwt_perm)
#'
rowWelchTests <- function(X, categ,
alternative = c("two.sided", "less", "greater")) {
alternative <- match.arg(alternative)
stopifnot(all(categ %in% c(0, 1)))
categ0 <- as.matrix(categ)
categ1 <- 1 - categ0
nX <- as.matrix(colSums(categ0))
nY <- as.matrix(colSums(categ1))
sumX <- X %*% categ0
sumY <- X %*% categ1
XX <- X * X
sum2X <- XX %*% categ0
sum2Y <- XX %*% categ1
mX <- divide_cols(sumX, nX)
mY <- divide_cols(sumY, nY)
num <- sum2X - divide_cols(sumX^2, nX)
num <- abs(num) # fix rare corner cases where num ~= -1e-15
sX <- sqrt(divide_cols(num, nX - 1))
num <- sum2Y - divide_cols(sumY^2, nY)
num <- abs(num) # fix rare corner cases where num ~= -1e-15
sY <- sqrt(divide_cols(num, nY - 1))
suffWelchTests(mX, mY, sX, sY, nX, nY, alternative = alternative)
}
#' Welch test from sufficient statistics
#'
#' @param mx A numeric value or vector, the sample average for condition "x"
#' @param my A numeric value or vector of the same length as 'mx', the sample
#' average for condition "y"
#' @param sx A numeric value or vector of the same length as 'mx', the standard
#' deviation for condition "x"
#' @param sy A numeric value or vector of the same length as 'mx', the standard
#' deviation for condition "y"
#' @param nx A numeric value or vector of the same length as 'mx', the sample
#' size for condition "x"
#' @param ny A numeric value or vector of the same length as 'mx', the sample
#' size for condition "y"
#' @param alternative A character string specifying the alternative hypothesis.
#' @return A list with elements:
#' \describe{
#' \item{statistic}{the value of the t-statistic}
#' \item{parameter}{the degrees of freedom for the t-statistic}
#' \item{p.value}{the p-value for the test}
#' }
#' @author Pierre Neuvial
#'
#' @details In accordance with the implementation of \code{\link[stats]{t.test}} and friends, we
#' follow the rule that 'alternative = "greater"' is the alternative that 'x'
#' has a larger mean than 'y'.
#'
#' @noRd
#' @importFrom stats pt
#' @examples
#'
#' # Reproducing the results of the 't.test' function
#'
#' x <- rnorm(100)
#' y <- rnorm(34, mean = 1)
#' target <- t.test(x, y)
#' swt <- suffWelchTests(mean(x), mean(y), sd(x), sd(y), length(x), length(y))
#' all.equal(swt$statistic, target$statistic, check.attributes = FALSE)
#' all.equal(swt$p.value, target$p.value, check.attributes = FALSE)
#' all.equal(swt$parameter, target$parameter, check.attributes = FALSE)
#'
#' target <- t.test(x, y, alternative = "greater")
#' swt <- suffWelchTests(mean(x), mean(y), sd(x), sd(y),
#' length(x), length(y),
#' alternative = "greater"
#' )
#' all.equal(swt$statistic, target$statistic, check.attributes = FALSE)
#' all.equal(swt$p.value, target$p.value, check.attributes = FALSE)
#' all.equal(swt$parameter, target$parameter, check.attributes = FALSE)
#'
suffWelchTests <- function(mx, my, sx, sy, nx, ny,
alternative = c("two.sided", "less", "greater")) {
alternative <- match.arg(alternative)
if (is.vector(mx)) {
if (!all(
is.vector(my), is.vector(sx), is.vector(sy),
is.vector(nx), is.vector(ny)
)) {
stop("All numeric inputs should be of the same type (either vector or matrix)")
}
mx <- as.matrix(mx)
my <- as.matrix(my)
sx <- as.matrix(sx)
sy <- as.matrix(sy)
nx <- as.matrix(nx)
ny <- as.matrix(ny)
} else if (is.matrix(mx)) {
if (!all(
is.matrix(my), is.matrix(sx), is.matrix(sy),
is.matrix(nx), is.matrix(ny)
)) {
stop("All numeric inputs should be of the same type (either vector or matrix)")
}
} else {
stop("Numeric inputs should be vectors or matrices")
}
sse.x <- divide_cols(sx^2, nx) ## sc <-> sqrt((sum2c - sumc^2/nc)/(nc-1))
sse.y <- divide_cols(sy^2, ny)
sse <- sse.x + sse.y
sse2 <- sse^2
## test statistic
stat <- (mx - my) / sqrt(sse)
## names(stat) <- rep("t", length(stat))
## approximate degrees of freedom (Welch-Satterthwaite)
deno <- divide_cols(sse.x^2, nx - 1) + divide_cols(sse.y^2, ny - 1)
df <- sse2 / deno
## names(df) <- "df"
# mean difference
meanDiff <- mx - my
# coerce to vector if single column
if (ncol(stat) == 1) {
stat <- stat[, 1]
stopifnot(ncol(df) == 1)
df <- df[, 1]
stopifnot(ncol(meanDiff) == 1)
meanDiff <- meanDiff[, 1]
}
## p-value
pval <- switch(alternative,
"two.sided" = 2 * (1 - pt(abs(stat), df = df)),
"greater" = 1 - pt(stat, df = df),
"less" = pt(stat, df = df)
)
list(
statistic = stat,
parameter = df,
p.value = pval,
estimate = meanDiff
)
}
divide_cols <- function(a, b) sweep(a, 2, b, "/")
categCheck <- function(categ, n) {
name <- as.character(substitute(categ))
if (length(categ) != n) {
stop(name, " should be of length ", n, ", not", length(categ))
}
categ <- as.factor(categ)
cats <- levels(categ)
if (!identical(cats, c("0", "1")) & length(cats) <= 2) {
stop("'", name, "' should consist only of '0' and '1' or distinct continuous values.")
}
}
#' Welch test from sufficient statistics
#'
#' @param mx A numeric value or vector, the sample average for condition "x"
#' @param my A numeric value or vector of the same length as 'mx', the sample
#' average for condition "y"
#' @param sx A numeric value or vector of the same length as 'mx', the standard
#' deviation for condition "x"
#' @param sy A numeric value or vector of the same length as 'mx', the standard
#' deviation for condition "y"
#' @param nx A numeric value or vector of the same length as 'mx', the sample
#' size for condition "x"
#' @param ny A numeric value or vector of the same length as 'mx', the sample
#' size for condition "y"
#' @param alternative A character string specifying the alternative hypothesis.
#' Currently only "two.sided" is implemented
#' @return A list with elements:
#' \describe{ \item{statistic}{the value of the t-statistic}
#' \item{parameter}{the degrees of freedom for the t-statistic}
#' \item{p.value}{the p-value for the test} }
#' @author Pierre Neuvial
#'
#' @details In accordance with the implementation of \code{\link[stats]{t.test}} and friends, we
#' follow the rule that 'alternative = "greater"' is the alternative that 'x'
#' has a larger mean than 'y'.
#'
#' @importFrom stats pt
#' @noRd
#' @examples
#' # Reproducing the results of the 't.test' function
#'
#' x <- rnorm(100)
#' y <- rnorm(34, mean = 1)
#' target <- t.test(x, y)
#' swt <- suffWelchTests1(mean(x), mean(y), sd(x), sd(y), length(x), length(y))
#' all.equal(swt$statistic, target$statistic, check.attributes = FALSE)
#' all.equal(swt$p.value - target$p.value, check.attributes = FALSE)
#' all.equal(swt$parameter - target$parameter, check.attributes = FALSE)
#'
#' target <- t.test(x, y, alternative = "greater")
#' swt <- suffWelchTests1(mean(x), mean(y), sd(x), sd(y),
#' length(x), length(y),
#' alternative = "greater"
#' )
#' all.equal(swt$statistic, target$statistic, check.attributes = FALSE)
#' all.equal(swt$p.value, target$p.value, check.attributes = FALSE)
#' all.equal(swt$parameter, target$parameter, check.attributes = FALSE)
#'
#' @noRd
suffWelchTests1 <- function(mx, my, sx, sy, nx, ny,
alternative = c("two.sided", "less", "greater")) {
alternative <- match.arg(alternative)
# if (alternative != "two.sided") stop("altenative", alternative, "not implemented yet!")
## sanity checks
p <- length(mx)
stopifnot(length(my) == p)
stopifnot(length(sx) == p)
stopifnot(length(sy) == p)
stopifnot(length(nx) %in% c(1, p))
stopifnot(length(ny) %in% c(1, p))
sse.x <- sx^2 / nx ## sc <- sqrt((sum2c - sumc^2/nc)/(nc-1))
sse.y <- sy^2 / ny
sse <- sse.x + sse.y
sse2 <- sse^2
## test statistic
stat <- (mx - my) / sqrt(sse)
## names(stat) <- rep("t", length(stat))
## approximate degrees of freedom (Welch-Satterthwaite)
df <- sse2 / (sse.x^2 / (nx - 1) + sse.y^2 / (ny - 1))
## names(df) <- "df"
## p-value
pval <- switch(alternative,
"two.sided" = 2 * (1 - pt(abs(stat), df = df)),
"greater" = 1 - pt(stat, df = df),
"less" = pt(stat, df = df)
)
list(
statistic = stat,
parameter = df,
p.value = pval,
estimate = mx - my
)
}
rowWelchTests1 <- function(mat, categ, alternative = c("two.sided", "less", "greater")) {
alternative <- match.arg(alternative)
categCheck(categ, ncol(mat))
sstats <- getSummaryStats(mat, categ = categ)
Y <- sstats[["0"]]
X <- sstats[["1"]] ## as per the doc of t.test:
## 'alternative = "greater"' is the alternative that 'x' has a larger mean
## than 'y'.
swt <- suffWelchTests1(X[["mean"]], Y[["mean"]],
X[["sd"]], Y[["sd"]],
X[["n"]], Y[["n"]],
alternative = alternative
)
swt
}
#' Convert a matrix of observations into summary statistics
#'
#' Convert a matrix of observations labelled into categories into summary
#' statistics for each category
#'
#' The following statistics are calculated: sums, sums of squares, means,
#' standard deviations, sample sizes
#'
#' @param mat A \code{m x n} numeric matrix whose rows correspond to variables
#' and columns to observations
#'
#' @param categ A vector of \code{n} categories for the observations
#'
#' @return A list of \code{n} elements containing the above-described
#' summary statistics for each category
#'
#' @author Pierre Neuvial
#'
#' @noRd
#' @examples
#'
#' mat <- matrix(rnorm(3051 * 38), ncol = 38)
#' cls <- rep(c(0, 1), times = c(27, 11))
#' stats <- getSummaryStats(mat, categ = cls)
getSummaryStats <- function(mat, categ) {
stopifnot(ncol(mat) == length(categ))
cats <- sort(unique(categ))
res <- list()
for (cc in seq(along = cats)) {
ww <- which(categ == cats[cc])
matc <- mat[, ww, drop = FALSE]
sumc <- rowSums(matc)
sum2c <- rowSums(matc^2)
nc <- length(ww)
mc <- sumc / nc
sc <- sqrt((sum2c - sumc^2 / nc) / (nc - 1))
res[[cc]] <- list(
sum = sumc,
sum2 = sum2c,
n = nc,
mean = mc,
sd = sc
)
}
names(res) <- cats
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.