#' Wald-Wolfiwitz test
#'
#'
#' Part of code is imported from package \pkg{DescTools}.
#' The 2-sample runs test is known as the Wald-Wolfowitz test.
#' The runs test for randomness is used to test the hypothesis that a series of
#' numbers is random.
#'
#'
#'
#' Runs Test for Randomness
#'
#' Performs a test whether the elements of \code{x} are serially independent -
#' say, whether they occur in a random order - by counting how many runs there
#' are above and below a threshold. If \code{y} is supplied a two sample
#' Wald-Wolfowitz-Test testing the equality of two distributions against
#' general alternatives will be computed.
#'
#' The runs test for randomness is used to test the hypothesis that a series of
#' numbers is random. The 2-sample test is known as the Wald-Wolfowitz test.
#' \cr
#'
#' For a categorical variable, the number of runs correspond to the number of
#' times the category changes, that is, where \eqn{x_{i}}{x_i} belongs to one
#' category and \eqn{x_{i+1}}{x_(i+1)} belongs to the other. The number of runs
#' is the number of sign changes plus one.\cr
#'
#' For a numeric variable x containing more than two values, a run is a set of
#' sequential values that are either all above or below a specified cutpoint,
#' typically the median. This is not necessarily the best choice. If another
#' threshold should be used use a code like: \code{RunsTest(x > mean(x))}.
#'
#' The exact distribution of runs and the p-value based on it are described in
#' the manual of SPSS "Exact tests"
#' \url{http://www.sussex.ac.uk/its/pdfs/SPSS_Exact_Tests_21.pdf}.
#'
#' The normal approximation of the runs test is calculated with the expected
#' number of runs under the null \deqn{\mu_r=\frac{2 n_0 n_1}{n_0 + n_1} + 1}
#' and its variance \deqn{\sigma_r^2 = \frac{2 n_0 n_1 (2 n_0 n_1 - n_0 - n_1)
#' }{(n_0 + n_1)^2 \cdot (n_0 + n_1 - 1)}} as \deqn{\hat{z}=\frac{r - \mu_r +
#' c}{\sigma_r}} where \eqn{n_0, n_1} the number of values below/above the
#' threshold and \eqn{r} the number of runs.
#'
#' Setting the continuity correction \code{correct = TRUE} will yield the
#' normal approximation as SAS (and SPSS if n < 50) does it, see
#' \url{http://support.sas.com/kb/33/092.html}. The c is set to \eqn{c = 0.5}
#' if \eqn{r < \frac{2 n_0 n_1}{n_0 + n_1} + 1} and to \eqn{c = -0.5} if \eqn{r
#' > \frac{2 n_0 n_1}{n_0 + n_1} + 1}.
#'
#' @aliases RunsTest RunsTest.formula RunsTest.default
#' @param x a dichotomous vector of data values or a (non-empty) numeric vector
#' of data values.
#' @param y an optional (non-empty) numeric vector of data values.
#' @param formula a formula of the form \code{lhs ~ rhs} where \code{lhs} gives
#' the data values and rhs the corresponding groups.
#' @param data an optional matrix or data frame (or similar: see
#' \code{\link{model.frame}}) containing the variables in the formula
#' \code{formula}. By default the variables are taken from
#' \code{environment(formula)}.
#' @param subset an optional vector specifying a subset of observations to be
#' used.
#' @param na.action a function which indicates what should happen when the data
#' contain NAs. Defaults to \code{getOption("na.action")}.
#' @param alternative a character string specifying the alternative hypothesis,
#' must be one of \code{"two.sided"} (default), \code{"less"} or
#' \code{"greater"}.
#' @param exact a logical indicating whether an exact p-value should be
#' computed. By default exact values will be calculated for small vectors with
#' a total length <= 30 and the normal approximation for longer ones.
#' @param correct a logical indicating whether to apply continuity correction
#' when computing the test statistic. Default is \code{TRUE}. Ignored if
#' \code{exact} is set to \code{TRUE}. See details.
#' @param na.rm defines if \code{NA}s should be omitted. Default is
#' \code{FALSE}.
#' @param \dots further arguments to be passed to or from methods.
#' @return A list with the following components. \item{statistic}{z, the value
#' of the standardized runs statistic, if not exact p-values are computed.}
#' \item{parameter}{the number of runs, the total number of zeros (m) and ones
#' (n)} \item{p.value}{the p-value for the test.} \item{data.name}{a character
#' string giving the names of the data.} \item{alternative}{a character string
#' describing the alternative hypothesis.}
#' @author Andri Signorell <andri@@signorell.net>, exact p-values by Detlew
#' Labes <detlewlabes@@gmx.de>
#' @seealso Run Length Encoding \code{\link{rle}}
#' @references Wackerly, D., Mendenhall, W. Scheaffer, R. L. (1986):
#' \emph{Mathematical Statistics with Applications}, 3rd Ed., Duxbury Press,
#' CA.
#'
#' Wald, A. and Wolfowitz, J. (1940): On a test whether two samples are from
#' the same population, \emph{Ann. Math Statist}. 11, 147-162.
#' @keywords htest
#'
#' @export
#'
#' @examples
#'
#' # x will be coerced to a dichotomous variable
#' x <- c("S","S", "T", "S", "T","T","T", "S", "T")
#' RunsTest(x)
#'
#'
#' x <- c(13, 3, 14, 14, 1, 14, 3, 8, 14, 17, 9, 14, 13, 2, 16, 1, 3, 12, 13, 14)
#' RunsTest(x)
#' # this will be treated as
#' RunsTest(x > median(x))
#'
#' plot( (x < median(x)) - 0.5, type="s", ylim=c(-1,1) )
#' abline(h=0)
#'
#' set.seed(123)
#' x <- sample(0:1, size=100, replace=TRUE)
#' RunsTest(x)
#' # As you would expect of values from a random number generator, the test fails to reject
#' # the null hypothesis that the data are random.
#'
#'
#' # SPSS example
#' x <- c(31,23,36,43,51,44,12,26,43,75,2,3,15,18,78,24,13,27,86,61,13,7,6,8)
#' RunsTest(x)
#' RunsTest(x, exact=TRUE)
#'
#' # SPSS example small dataset
#' x <- c(1, 1, 1, 1, 0, 0, 0, 0, 1, 1)
#' RunsTest(x)
#' RunsTest(x, exact=FALSE)
#'
#' # if y is not NULL, the Wald-Wolfowitz-Test will be performed
#' A <- c(35,44,39,50,48,29,60,75,49,66)
#' B <- c(17,23,13,24,33,21,18,16,32)
#'
#' RunsTest(A, B, exact=TRUE)
#' RunsTest(A, B, exact=FALSE)
#'
# ============================================================================
wald_wolfowitz_test <- function(x, ...) {
UseMethod("wald_wolfowitz_test")
}
# ============================================================================
#' @rdname wald_wolfowitz_test
#' @export
wald_wolfowitz_test.formula <- function(x, data = NULL,
alternative = "two.sided",
exact = NULL, correct = TRUE,
na.rm = FALSE, ...)
{
if (is.null(data)) {
data <- rlang::f_env(x)
}
data <- model.frame(x, data)
# dname <- deparse(substitute(x))
wald_wolfowitz_test(data[[1]], data[[2]],
alternative = alternative,
exact = exact,
correct = correct,
na.rm = na.rm,
...,
# dname = dname
)
}
# ============================================================================
#' @rdname wald_wolfowitz_test
#' @export
wald_wolfowitz_test.default <- function(x, group,
alternative = "two.sided",
exact = NULL, correct = TRUE, na.rm = FALSE, ...)
{
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if (na.rm) {
ind_ok <- complete.cases(x, group)
x <- x[ind_ok]
group <- group[ind_ok]
}
alternative <- match.arg(alternative)
# if (is.null(as.list(...)$dname))
dname <- paste(deparse(substitute(x)), "and", deparse(substitute(group)))
if (!nlevels(group) %in% c(1, 2))
stop("Can only process dichotomous variables")
levs <- levels(group)
m <- sum(group == levs[1])
n <- sum(group == levs[2])
if (is.null(exact)) exact <- ((m + n) <= 30)
# Calculate runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# "two.sided"
alternative <- "true number of runs is not equal to the expected number"
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
split_list <- split(group, x)
# Rezult is list:
rez_df <- expand.grid(lapply(split_list, get_permutations))
# Convert to matrix:
mat <- apply(rez_df, 1, unlist)
# Calculte number of runs for each column of matrix.
# Select only minimum and maximum possible number of runs.
runs <- unique(min_max(apply(mat, 2, calculate_runs)))
res <- sapply(runs, two_sided_pval_for_runs)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
method <- paste0("Wald-Wolfowitz Runs Test (",
ifelse(exact, "exact", "asymptotic"),
ifelse(!exact && correct, " with continuity correnction", ""),
")")
if (length(runs) == 1) {
structure(list(statistic = unlist(res["statistic", 1]),
p.value = unlist(res["p.value", 1]),
method = method,
alternative = alternative,
data.name = dname,
estimate = NULL,
parameter = c(runs = runs[1], n1 = m, n2 = n)),
class = c("wwtest", "htest"))
} else {
list(runs_min =
structure(list(statistic = unlist(res["statistic", 1]),
p.value = unlist(res["p.value", 1]),
method = paste(method,
" \nNOTE: ties exist, p values for MINIMUM",
"possible number of runs calculated."),
alternative = alternative,
data.name = dname,
estimate = NULL,
parameter = c(runs = runs[1], m = m, n = n)),
class = c("wwtest", "htest")),
runs_max =
structure(list(statistic = unlist(res["statistic", 2]),
p.value = unlist(res["p.value", 2]),
method = paste(method,
" \nNOTE: ties exist, p values for MAXIMUM",
"possible number of runs calculated."),
alternative = alternative,
data.name = dname,
estimate = NULL,
parameter = c(runs = runs[2], m = m, n = n)),
class = c("wwtest", "htest"))
)
}
}
# ============================================================================
get_permutations <- function(x) {
# Get all unique permutations of input vector (`x`) values.
n <- length(x)
# All possible combinations of elements
ve <- as.matrix(expand.grid(lapply(1:n, function(i){1:n})))
# Combinations, in which the same object does not repeat
ind_mat_ok <- apply(ve, 1, function(i) all(1:n %in% i))
# Permutations of indices
dfi <- as.data.frame(t(ve[ind_mat_ok, ]))
# Permutations of original inputs (with duplication)
x_pernutations <- lapply(dfi, function(i) x[i])
# Remove duplications
x_pernutations[!duplicated(x_pernutations)]
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pruns <- function(r, n1, n2, alternative = c("two.sided", "less", "greater")) {
.druns_nom <- function(r, n1, n2) {
pp <- vector(mode = "numeric", length = length(r))
for (i in seq_along(r)) {
if (2 * r[i] %/% 2 == r[i]) {
k <- r[i]/2
pp[i] <- 2 * choose(n1 - 1, k - 1) * choose(n2 - 1, k - 1)
} else {
k <- (r[i] - 1)/2
pp[i] <- choose(n1 - 1, k - 1) * choose(n2 - 1, k) +
choose(n1 - 1, k) * choose(n2 - 1, k - 1)
}
}
pp
}
alternative <- match.arg(alternative)
n <- n1 + n2
if (r <= 1) stop("Number of runs must be > 1")
if (r > n) stop("Number of runs must be < (n1 + n2")
if (n1 < 1 | n2 < 1) return(0)
E <- 1 + 2 * n1 * n2/n
denom <- choose(n, n1)
rmax <- ifelse(n1 == n2, 2 * n1, 2 * min(n1, n2) + 1)
rv <- 2:rmax
pp <- .druns_nom(rv, n1, n2)
pL <- sum(pp[rv <= r])/denom
pU <- 1 - sum(pp[rv <= (r - 1)])/denom
p2 <- sum(pp[abs(rv - E) >= abs(r - E)])/denom
p2min <- 2 * min(c(pL, pU, 0.5))
return(switch(alternative, less = pL, greater = pU, two.sided = p2))
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
calculate_runs <- function(g) {
# Calculate number of runs in the vector g.
# g - two groups factor
g <- factor(g)
g <- as.numeric(g) - 1
runs <- sum(diff(g) != 0) + 1
runs
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
two_sided_pval_for_runs <- function(runs) {
# two_sided_pval_for_runs <- function(runs, m, n, exact, correct) {
# Calculate p value and related statistics for two-tailed
# Wald-Wolfowitz test.
#
# runs - (integer) number of runs.
# m, n (integer) sample sizes for both groups
# exact (logical) Should eact test be used
# correct (logical) Sould continuity correction be used for
# asymptotic (non-exact test)
if (exact == TRUE) {
# Exact text ------------------------------------------------------
p.value <- pruns(runs, m, n, alternative = "two.sided")
statistic <- NULL
} else {
# Asymptotic test -------------------------------------------------
E <- 1 + 2 * n * m/(n + m)
s2 <- (2 * n * m * (2 * n * m - n - m))/((n + m)^2 * (n + m - 1))
if (correct) {
switch(as.character(cut(runs - E,
breaks = c(-Inf, -0.5, 0.5, Inf),
labels = c("a", "b", "c"))),
a = statistic <- (runs - E + 0.5)/sqrt(s2),
b = statistic <- 0,
c = statistic <- (runs - E - 0.5)/sqrt(s2))
} else {
statistic <- (runs - E)/sqrt(s2)
}
p.value <- 2 * min(pnorm(statistic), 1 - pnorm(statistic))
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Return
list(p.value = p.value,
statistic = statistic)
}
# ============================================================================
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.