Nothing
# Original file .../src/library/stats/R/ks.test.R
# Part of the R package, http://www.R-project.org
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# http://www.r-project.org/Licenses/
#
# Modification by Taylor Arnold and Jay Emerson, 2010:
#
# Any changes to the original R Core source is noted by
# the following comment convension to aid code review:
#
# ####################
# ## TBA, JWE: comment
#
# ## END TBA, JWE
# ###############
ks.test <- function(x, y, ...,
alternative = c("two.sided", "less", "greater"),
exact = NULL,
#########################################################################
## TBA, JWE: an additional argument, tol, is used as an upper bound for
## possible rounding error in values (say, a and b) when needing to check
## for equality (a==b). We also use it in establishing a small epsilon
## for some other calculations involving the step functions.
## Additional arguments, simulate.p.value and B, follow the usage of
## fisher.test, but are only used for the discrete distribution case.
tol=1e-8,
simulate.p.value=FALSE, B=2000)
## END TBA, JWE
################
{
pkolmogorov1x <- function(x, n) {
## Probability function for the one-sided one-sample Kolmogorov
## statistics, based on the formula of Birnbaum & Tingey (1951).
if(x <= 0) return(0)
if(x >= 1) return(1)
j <- seq.int(from = 0, to = floor(n * (1 - x)))
1 - x * sum(exp(lchoose(n, j)
+ (n - j) * log(1 - x - j / n)
+ (j - 1) * log(x + j / n)))
}
#############################################################
## TBA, JWE: exact.pval is a new function implementing the
## p-value calculations presented in Conover (1972) and
## Gleser (1985), which are cited in our revision of
## ks.test.Rd.
exact.pval <- function(alternative, STATISTIC, x, n, y, knots.y, tol) {
ts.pval <- function(S, x, n, y, knots.y, tol) {
# The exact two-sided p-value from Gleser (1985)
# and Niederhausen (1981)
f_n <- ecdf(x)
eps <- min(tol, min(diff(knots.y)) * tol)
eps2 <- min(tol, min(diff(y(knots.y))) * tol)
a <- rep(0, n); b <- a; f_a <- a
for (i in 1:n) {
a[i] <- min(c(knots.y[which(y(knots.y) + S >= i/n + eps2)[1]],
Inf), na.rm=TRUE)
b[i] <- min(c(knots.y[which(y(knots.y) - S > (i-1)/n - eps2)[1]],
Inf), na.rm=TRUE)
# Calculate F(a_i-)
# If a_i is not a knot of F_0,
# then simply return the value of F_0(a_i).
# Otherwise, evaluate F_0(a_i - eps)
f_a[i] <- ifelse(!(a[i] %in% knots.y),
y(a[i]), y(a[i]-eps))
}
f_b <- y(b)
# NOW HAVE f_a and f_b which are Niederhausen u, v, and this
# uses the Noe result.
p <- rep(1, n+1)
for (i in 1:n) {
tmp <- 0
for (k in 0:(i-1)) {
tmp <- tmp + choose(i, k) * (-1)^(i-k-1) *
max(f_b[k+1] - f_a[i], 0)^(i-k) * p[k+1]
}
p[i+1] <- tmp
}
p <- max(0, 1 - p[n+1]) # Niederhausen result
if (p>1) {
warning("numerical instability in p-value calculation.")
p <- 1
}
return(p)
}
# Conover
less.pval <- function(S, n, H, z, tol) {
m <- ceiling(n*(1-S))
c <- S+(1:m-1)/n
CDFVAL <- H(sort(z))
for(j in 1:length(c)) {
ifelse( (min(abs(c[j] - CDFVAL)) < tol),
c[j] <- 1-c[j],
c[j] <- 1-CDFVAL[which(order(c(c[j], CDFVAL)) == 1 )] )
}
b <- rep(0, m)
b[1] <- 1
for(k in 1:(m-1))
b[k+1] <- 1 - sum(choose(k,1:k-1)*c[1:k]^(k-1:k+1)*b[1:k])
p <- sum(choose(n, 0:(m-1))*c^(n-0:(m-1))*b )
return(p)
}
# Conover
greater.pval <- function(S, n, H, z, tol) {
m <- ceiling(n*(1-S))
c <- 1-(S+(1:m-1)/n)
CDFVAL <- c(0,H(sort(z)))
for(j in 1:length(c)) {
if( !(min(abs(c[j] - CDFVAL)) < tol) )
c[j] <- CDFVAL[which(order(c(c[j], CDFVAL)) == 1 ) - 1]
}
b <- rep(0, m)
b[1] <- 1
for(k in 1:(m-1))
b[k+1] <- 1 - sum(choose(k,1:k-1)*c[1:k]^(k-1:k+1)*b[1:k])
p <- sum(choose(n, 0:(m-1))*c^(n-0:(m-1))*b )
return(p)
}
p <- switch(alternative,
"two.sided" = ts.pval(STATISTIC, x, n, y, knots.y, tol),
"less" = less.pval(STATISTIC, n, y, knots.y, tol),
"greater" = greater.pval(STATISTIC, n, y, knots.y, tol))
return(p)
}
sim.pval <- function(alternative, STATISTIC, x, n, y, knots.y, tol, B) {
# Simulate B samples from given stepfun y
fknots.y <- y(knots.y)
u <- runif(B*length(x))
u <- sapply(u, function(a) return(knots.y[sum(a>fknots.y)+1]))
dim(u) <- c(B, length(x))
# Calculate B values of the test statistic
getks <- function(a, knots.y, fknots.y) {
dev <- c(0, ecdf(a)(knots.y) - fknots.y)
STATISTIC <- switch(alternative,
"two.sided" = max(abs(dev)),
"greater" = max(dev),
"less" = max(-dev))
return(STATISTIC)
}
s <- apply(u, 1, getks, knots.y, fknots.y)
# Produce the estimated p-value
return(sum(s >= STATISTIC-tol) / B)
}
## END TBA, JWE
###############
alternative <- match.arg(alternative)
DNAME <- deparse(substitute(x))
x <- x[!is.na(x)]
n <- length(x)
if(n < 1L)
stop("not enough 'x' data")
PVAL <- NULL
if(is.numeric(y)) {
DNAME <- paste(DNAME, "and", deparse(substitute(y)))
y <- y[!is.na(y)]
n.x <- as.double(n) # to avoid integer overflow
n.y <- length(y)
if(n.y < 1L)
stop("not enough 'y' data")
if(is.null(exact))
exact <- (n.x * n.y < 10000)
METHOD <- "Two-sample Kolmogorov-Smirnov test"
TIES <- FALSE
n <- n.x * n.y / (n.x + n.y)
w <- c(x, y)
z <- cumsum(ifelse(order(w) <= n.x, 1 / n.x, - 1 / n.y))
if(length(unique(w)) < (n.x + n.y)) {
warning("cannot compute correct p-values with ties")
z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)]
TIES <- TRUE
}
STATISTIC <- switch(alternative,
"two.sided" = max(abs(z)),
"greater" = max(z),
"less" = - min(z))
nm_alternative <- switch(alternative,
"two.sided" = "two-sided",
"less" = "the CDF of x lies below that of y",
"greater" = "the CDF of x lies above that of y")
if(exact && (alternative == "two.sided") && !TIES)
PVAL <- 1 - .C("psmirnov2x",
p = as.double(STATISTIC),
as.integer(n.x),
as.integer(n.y),
PACKAGE = "stats")$p
#################################################################
## TBA, JWE: the following else implements behavior relating to
## a new possibility for y: an object of class 'stepfun', of
## which 'ecdf' is a subclass we expect most users to have.
## Don't recalculate knots(z) more than necessary.
} else if (is.stepfun(y)) {
z <- knots(y)
## JE NOTE: could allow larger n for two-sided case???
## revisit.
if(is.null(exact)) exact <- (n <= 30)
if (exact && n>30) {
warning("numerical instability may affect p-value")
}
METHOD <- "One-sample Kolmogorov-Smirnov test"
dev <- c(0, ecdf(x)(z) - y(z))
STATISTIC <- switch(alternative,
"two.sided" = max(abs(dev)),
"greater" = max(dev),
"less" = max(-dev))
if (simulate.p.value) {
PVAL <- sim.pval(alternative, STATISTIC,
x, n, y, z, tol, B)
} else {
PVAL <- switch(exact,
"TRUE" = exact.pval(alternative, STATISTIC,
x, n, y, z, tol),
"FALSE" = NULL)
}
nm_alternative <- switch(alternative,
"two.sided" = "two-sided",
"less" = "the CDF of x lies below the null hypothesis",
"greater" = "the CDF of x lies above the null hypothesis")
## END TBA, JWE
###############
} else {
if(is.character(y))
y <- get(y, mode="function")
if(mode(y) != "function")
stop("'y' must be numeric or a string naming a valid function")
if(is.null(exact))
exact <- (n < 100)
METHOD <- "One-sample Kolmogorov-Smirnov test"
TIES <- FALSE
if(length(unique(x)) < n) {
##########################################################
## TBA, JWE: the following error message has been changed.
warning(paste("default ks.test() cannot compute correct p-values with ties;\n",
"see help page for one-sample Kolmogorov test for discrete distributions."))
## END TBA, JWE
###############
TIES <- TRUE
}
x <- y(sort(x), ...) - (0 : (n-1)) / n
STATISTIC <- switch(alternative,
"two.sided" = max(c(x, 1/n - x)),
"greater" = max(1/n - x),
"less" = max(x))
if(exact && !TIES) {
PVAL <- if(alternative == "two.sided")
1 - .C("pkolmogorov2x",
p = as.double(STATISTIC),
as.integer(n),
PACKAGE = "stats")$p
else
1 - pkolmogorov1x(STATISTIC, n)
}
nm_alternative <- switch(alternative,
"two.sided" = "two-sided",
"less" = "the CDF of x lies below the null hypothesis",
"greater" = "the CDF of x lies above the null hypothesis")
}
names(STATISTIC) <- switch(alternative,
"two.sided" = "D",
"greater" = "D^+",
"less" = "D^-")
pkstwo <- function(x, tol = 1e-6) {
## Compute \sum_{-\infty}^\infty (-1)^k e^{-2k^2x^2}
## Not really needed at this generality for computing a single
## asymptotic p-value as below.
if(is.numeric(x))
x <- as.vector(x)
else
stop("argument 'x' must be numeric")
p <- rep(0, length(x))
p[is.na(x)] <- NA
IND <- which(!is.na(x) & (x > 0))
if(length(IND)) {
p[IND] <- .C("pkstwo",
as.integer(length(x[IND])),
p = as.double(x[IND]),
as.double(tol),
PACKAGE = "stats")$p
}
return(p)
}
if(is.null(PVAL)) {
## <FIXME>
## Currently, p-values for the two-sided two-sample case are
## exact if n.x * n.y < 10000 (unless controlled explicitly).
## In all other cases, the asymptotic distribution is used
## directly. But: let m and n be the min and max of the sample
## sizes, respectively. Then, according to Kim and Jennrich
## (1973), if m < n / 10, we should use the
## * Kolmogorov approximation with c.c. -1/(2*n) if 1 < m < 80;
## * Smirnov approximation with c.c. 1/(2*sqrt(n)) if m >= 80.
PVAL <- ifelse(alternative == "two.sided",
1 - pkstwo(sqrt(n) * STATISTIC),
exp(- 2 * n * STATISTIC^2))
## </FIXME>
}
RVAL <- list(statistic = STATISTIC,
p.value = PVAL,
alternative = nm_alternative,
method = METHOD,
data.name = DNAME)
class(RVAL) <- "htest"
return(RVAL)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.