## based on the following documents from 2014/2015: ## - QQandKSnew.html (R source lost) ```r ## package 'psistat' is discussed further below, so don't echo it here. library(psistat) set.seed(1234)
?ks.test ?ecdf ?qqnorm ?qqline
Package psistat provides some useful functions for this topic. You can install it on your own computers from the web page of the course (if you have not done this yet).
library(psistat)
The names of the functions in "psistat" start with "psi.". You can learn more about them using the help system. For example:
apropos("psi.*") help(package="psistat") package ? psistat ?psi.eqf ?psi.Dn ?psi.lks.exp.test ?psi.jitter
library(nortest) # for Lilliefors test ?lillie.test ```r library(e1071) # for probplot ?probplot
Dataset Oxboys is in package "mlmRev"
library(mlmRev) # ?Oxboys
Miscellaneous: use this command to plot in a new window:
dev.new()
Some random samples for the following examples.
x20 <- rnorm(20) # a short random sample, to show it on screen x200 <- rnorm(200) # larger x500 <- rnorm(500) # even larger x500ms <- rnorm(500, mean = 1, sd = 3) x2 <- rchisq(100, df = 2) xe <- rexp(500, rate = 1/1500) # life times of electric bulbs
x20.ecdf <- ecdf(x20) # its ecdf x20.ecdf
x20.ecdf is a function - we can use it as any other R function:
x20.ecdf(0) x20.ecdf(10)
Here we plot the ecdf and overlay the cdf used to simulate the data. The sample is small, so the two do not match very closely:
plot(x20.ecdf) curve(pnorm, add = TRUE, col = "blue")
At the places were the ecdf jumps, the value is the one after the jump (indicated by the filled points). Here is a histogram with overlayed pdf for comparison.
hist(x20, freq = FALSE, ylim = c(0, 0.5)) curve(dnorm, add = TRUE)
For larger samples the ecdf is very good estimator of the underlying cdf:
x200.ecdf <- ecdf(x200) plot(x200.ecdf) curve(pnorm, add = TRUE, col = "red")
... and an even larger sample:
x500.ecdf <- ecdf(x500) # its ecdf plot(x500.ecdf) # ecdf and cdf overlayed curve(pnorm, add = TRUE, col = "red") ## This shows how to add the curve with non-default values of the ## arguments. This simply illustrates the technique: norm2 <- function(x) pnorm(x, mean = 2) curve(norm2, add = TRUE, col = "blue")
# ?quantile quantile(x500, probs=0.5) quantile(x500, probs= c(0.25,0.5, 0.75)) quantile(x20, 0.5) hist(x20, freq=FALSE, ylim = c(0, 0.5)) curve(dnorm, from = -3, to = 3, add = TRUE, col = "blue")
Empirical quantile function (from package "psistat")
?psi.eqf
The eqf of a small sample:
x20.eqf <- psi.eqf(x20) plot(x20.eqf, main = "Eqf of x20") curve(qnorm, add = TRUE, col = "blue") # plot(x200) x200ecdf <- ecdf(x200) x200ecdf(0) quantile(x200, c(.25, .5, 0.75)) x200eqf <- psi.eqf(x200) x200eqf(0.5) x200eqf(0.25) plot(x200eqf, main = "Eqf of x200") x500.eqf <- psi.eqf(x500) # a larger sample plot(x500.eqf, main = "Eqf of x500") curve(qnorm, add = TRUE, col = "blue") ```r ## #' Illustrate that quantiles can be plotted even without quantile f. (The "Weekend fun" ## #' question that stood for several weeks.) ## y500 <- pnorm(x500) ## plot(sort(x500), sort(y500)) # plot a cdf ## ## plot(sort(y500), sort(x500)) # plot the inverse cdf without using quantile function
We follow the procedure given in the notes: prepare points p, evaluate the scores s, compute the order statistics and plot. Does x20 come from N(0,1)?
n <- length(x20) n p <- (1:n - 0.5) # to check that it gives 0.5, 1.5, ..., 19.5 p p <- (1:n - 0.5)/n p s <- qnorm(p) # scores x20.sorted <- sort(x20) # order statistics plot(s, x20.sorted) # qq-plot abline(0, 1, col = "blue") # plot the reference line
Does x2 come from N(0,1)?
n <- length(x2) p <- ((1:n) - 0.5) / n s2 <- qnorm(p) x2s <- sort(x2) plot(s2, x2s) abline(0, 1, col = "red")
Does x200 come from N(0,1)?
n <- length(x200) p <- ((1:n) - 0.5) / n si <- qnorm(p) x200.sorted <- sort(x200) plot(si, x200.sorted) abline(0, 1, col = "blue")
Does x200 come from Student-t with 3 d.f.?
si <- qt(p, df = 3) plot(si, x200.sorted) abline(0,1, col = "blue")
Does x500 come from N(0,1)?
n <- length(x500) # x500 p <- (1:n - 0.5)/n s <- qnorm(p) x500.sorted <- sort(x500) plot(s, x500.sorted) abline(0,1) # various ways to overlay a line lm(x500.sorted ~ s) abline(lm(x500.sorted ~ s), col = "red") qqline(x500.sorted)
qq-plots for x500 The following examples are of qq-plots for x500 with various non-matching distributions.
s <- qt(p, df = 3) # H0 is t_3 dist plot(s, x500.sorted) abline(0, 1, col="blue") s <- qnorm(p, mean = 5) # H0 is N(5,1) plot(s,x500.sorted) abline(0, 1, col = "blue") s <- qnorm(p, mean = 5, sd = 2) # H0 is N(5,2^2) plot(s, x500.sorted) abline(0, 1, col = "blue") x500ms.sorted <- sort(x500ms) p <- ((1:500) - 0.5)/500 s <- qnorm(p) plot(s, x500ms.sorted) abline(0, 1, col = "blue") lmfit500 <- lm(x500ms.sorted ~ s) summary(lmfit500) abline(lmfit500, col = "red")
qq-plot example with an exponential distribution. First generate some data to work with: xe might represent lifetime of bulbs (incandescent have typical expected life equal to 1500 hours).
hist(xe) n <- length(xe) p <- ((1:n) - 0.5) / n s <- qexp(p, rate = 1/1500) plot(s, sort(xe))
qqnorm produces a normal qq-plot, i.e. a qq-plot for hypothesised normal distribution:
qqnorm(x500ms) abline(0,1) qqnorm(x500) # qqnorm qqnorm(xe) # no match here
probplot() is another function for qq-plots. Without further arguments it produces a normal plot:
library(e1071, quiet = TRUE) # for probplot probplot(x500)
For other hypothesised distributions the relevant quantile function should be given. We often need to define the function ourselves.
# # this doesn't work since probplot insists on naming the argument 'p' # qexp1500 <- function(x) qexp(x, rate=1/1500) # # here we oblige.
This function computes quantiles for exponential distribution with mean 1500:
qexp1500 <- function(p) qexp(p, rate = 1/1500)
(Note that probplot insists that the first argument is called 'p'.) Compare:
qexp1500(0.5) qexp(0.5, rate=1/1500)
Create the plot:
probplot(xe, qexp1500) x2a <- rnorm(500, mean=5, sd=2) qqnorm(x2a) mean(x2a) sd(x2a)
example: bulbs.txt
bulbs <- scan("bulbs.txt") bulbs qexp1500 <- function(p){ qexp(p, rate=1/1500) } probplot(bulbs, qdist = qexp1500) e100 <- rexp(100, rate=1/1500) p <- ((1:100) - 0.5)/100 s <- qexp(p, rate=1/1500) e100s <- sort(e100) plot(s, e100s) abline(0,1) e1000 <- rexp(1000, rate=1/1500) p <- ((1:1000) - 0.5)/1000 s <- qexp(p, rate=1/1500) e1000s <- sort(e1000) plot(s, e1000s) abline(0,1) s <- qnorm(p) plot(s, e1000s) library(e1071) probplot(e1000, qexp) probplot(e1000, qnorm) # ?qexp qchisq5 <- function(p) qchisq(p, df=5) probplot(e1000,qchisq5)
Approx mean and variance of order statistics example in Notes, p. 69
first using formulae on pp. 68-69
qnorm(4/(19+1)) ex <- qnorm(4/(19+1)) ex exvar <- 4*(19-4+1)/((19+1)^2*(19+2))/dnorm(ex)^2 exvar
... then via simulation
y <- numeric(1000) y[1] <- sort(rnorm(19))[4] y[2] <- sort(rnorm(19))[4]
and so on until y[1000] ... :)
# ... but better use a single command for(i in 1:1000) y[i] <- sort(rnorm(19))[4] hist(y, freq=FALSE) # estimate density mean(y) # estimate mean var(y) # estimate variance
this is an alternative to the 'for' loop (explain "replicate")
y <- replicate(1000, sort(rnorm(19))[4]) mean(y) var(y) quantile(y, c(0.25,0.5,0.75)) N <- 1000 x4 <- numeric(N) x4[1] <- sort(rnorm(19))[4] x4[1] x4[2] <- sort(rnorm(19))[4] # ... and so on; let's do it with a single command: for(i in 1:N) x4[i] <- sort(rnorm(19))[4]
Explore the distribution of the fourth order statistic:
mean(x4) var(x4) hist(x4) x4ecdf <- ecdf(x4) curve(x4ecdf, from = -2, to = 2) plot(x4ecdf) curve(pnorm, add = TRUE, col = "red")
DIY generation of a random sample from distribution Expo(1/2) First generate a sample from the uniform distribution.
u <- runif(8) u
Evaluate the quantiles of the required distribution (Expo(1/2) here) for the values in the U(0,1) random sample:
y <- -2*log(1-u) # DIY quantile function of Expo(1/2) y y1 <- qexp(u, rate = 1/2) # built-in quantile function y1
The results are the same (so, qexp uses the formula -log(1-u)/lambda):
all(y == y1)
u <- runif(100) x <- -1/2*log(1-u) ks.test(x, "pexp", rate=2) ks.test(x, "pexp", rate=10) pexp2 <- function(x) pexp(x, rate = 2) ks.test(x, pexp2) psi.Dn(x, pexp2) x <- runif(10) x y <- -1/2*log(1-x) y y1 <- qexp(x, rate=2) y1 all(y==y1) # TRUE (so, qexp uses the above formula
The cdf of the test statistic in KS test See examples for psi.pks.
# ?psi.pks psi.pks(0.6239385,4) psi.pks(0.2940753,20) psi.pks(0.1340279,100) psi.pks(0.04294685,1000)
Compare the distribution of the test statistic for various sample sizes:
xi <- seq(0,1,length=100) # some x values plot(xi,psi.pks(xi,4)) lines(xi,psi.pks(xi,4)) # cdf of D_4 lines(xi,psi.pks(xi,50),col="blue") # overlay the cdf of D_{50} lines(xi,psi.pks(xi,100),col="red") # overlay the cdf of D_{100} abline(h=0.95, col="brown") lines(xi,psi.pks(xi,1000),col="green") # overlay the cdf of D_{1000}
The abscissa of the intersection of the brown line with each of the curves gives the corresponding critical value of the KS test. Notice that for larger samples the critical value is smaller. In other words, smaller deviations from the hypothesised distribution function are considered significant. Similar to above using curve():
curve(psi.pks(x,4), from=0, to=1) # cdf of D_4 curve(psi.pks(x,10), from=0, to=1, col="blue", add=TRUE) # cdf of D_10 curve(psi.pks(x,50), from=0, to=1, col="brown", add=TRUE) # cdf of D_50 curve(psi.pks(x,100), from=0, to=1, col="red", add=TRUE) # cdf of D_100 curve(psi.pks(x,1000), from=0, to=1, col="green", add=TRUE) # cdf of D_1000 abline(h=0.95, col="brown")
x10 <- rnorm(10)
diy Dn for H_0 = cdf of N(0,1)
u10 <- pnorm(x10) u10 u10 <- sort(pnorm(x10)) u10 n <- 10 x10Dn <- max((1:n)/n - u10, u10 - (0:(n-1)/n)) x10Dn
now use psi.Dn to compute Dn, should give the same result.
psi.Dn(x10) # KS test # # ?psi.Dn x psi.Dn(x)
The value of the statistic is the same as that from ks.test:
ks.test(x, pnorm) psi.Dn(x) == ks.test(x, pnorm)$statistic pe <- function(x) pexp(x, rate=1/1500) ks.test(x, pe)
KS test 2013/2014 chunk
x20 x20os <- sort(x20) pnorm(x20os, mean=3, sd=2) pnorm(x20os, mean=3, sd=2) - (0:(20-1))/n wrk1 <- pnorm(x20os, mean=3, sd=2) - (0:(20-1))/20 wrk2 <- (1:20)/20 - pnorm(x20os, mean=3, sd=2) max(wrk1,wrk2) Dn.x20 <- max(wrk1,wrk2) Dn.x20 psi.Dn(x20) # ?psi.Dn psi.Dn(x20, cdf=pnorm, mean=3, sd=2) ks.test(x20, pnorm, mean=3, sd=2) # ?psi.pks psi.pks(0.5, n=20)
# # file.show("migraine.txt") datamig <- scan("migraine.txt") datamig ## [1] 98 90 155 86 80 84 70 128 93 40 108 90 130 48 55 106 145 ## [18] 126 100 115 75 95 38 66 63 32 105 118 21 142 ks.test(datamig, pnorm, mean=90, sd=35) ks.test(unique(datamig), pnorm, mean=90, sd=35) psi.Dn(unique(datamig), pnorm, mean=90, sd=35)
Example from help page of psi.pks
xi <- seq(0,1,length=100) # some x values plot(xi, psi.pks(xi,4)) # cdf of D_4
diy Dn
x <- unique(datamig) x xs <- sort(x) xs n <- length(xs) max( (1:n)/n - pnorm(xs,mean=90,sd=35), pnorm(xs,mean=90,sd=35) - (0:(n-1))/n)
Dn using psi.Dn
psi.Dn(unique(datamig), pnorm, mean=90, sd=35) ks.test(unique(datamig),pnorm, mean=90, sd=35) migDn <- max( (1:n)/n - pnorm(xs,mean=90,sd=35), pnorm(xs,mean=90,sd=35) - (0:(n-1))/n) migDn psi.pks(migDn, n) 1 - psi.pks(migDn, n)
apropos("psi.") ?psi.pkls.exp ?psi.plks.exp ?psi.qlks.exp
This package provides lillie.test():
library(nortest) # ?lillie.test lillie.test(x20)
# file.show("mothsontrees.txt") datamoths <- scan("mothsontrees.txt") datamoths # ?punif ks.test(datamoths, punif, min=0, max=25)
Alternatively:
mycdf <- function(q) punif(q, min=0, max=25) ks.test(datamoths, mycdf) mothssorted <- sort(datamoths) val <- punif(mothssorted, min=0, max=25) length(datamoths) n <- 14 (1:n)/n (1:n)/n - val val - (0:(n-1))/n max( (1:n)/n - val, val - (0:(n-1))/n ) ks.test(datamoths, "punif", min=0, max=25) f1 <- function(x) psi.pks(x,14) curve(f1, from=0.01, 0.99) xi <- seq(0,1, 0.01) head(cbind(xi, "f1(xi)" = f1(xi))) ks.test(datamoths, "punif", min=0, max=25) d14 <- max( (1:n)/n - val, val - (0:(n-1))/n ) d14 1 - psi.pks(d14,14) ks.test(datamoths, "punif", min=0, max=25)
datasales <- c(2,4,8,18,9,11,13) ks.test(datasales, "pnorm", mean=10, sd=3)
# file.show("barbiturate.txt") databarbi <- scan("barbiturate.txt") databarbi lillie.test(databarbi) ks.test(databarbi,pnorm,mean=mean(databarbi), sd=sd(databarbi))
databulbs <- scan("bulbs1000.txt") databulbs ks.test(databulbs, "pexp", rate=2) ks.test(databulbs, "pexp", rate=1/1000) mean(databulbs) 1/mean(databulbs) # apropos("psi.") # ?psi.lks.exp.test psi.lks.exp.test(databulbs)
databulbs15 <- scan("bulbs1500.txt") psi.lks.exp.test(databulbs15) psi.lks.exp.test(databulbs15) psi.lks.exp.test(databulbs15, Nsim=10000) # example: bulbs: is this the same as bulbs1000? # bulbs mean(bulbs) ks.test(bulbs, pexp, rate = 1/1500) ks.test(bulbs, pexp, rate = 1/1000)
Carry out Lilliefors test (simulation is used, so slight differences in repeated calculation)
psi.lks.exp.test(bulbs) psi.lks.exp.test(bulbs) psi.lks.exp.test(bulbs, Nsim = 10000) # for more precision
(semi-)diy (full diy would also calc. the Dn stat. by diy)
z <- bulbs/mean(bulbs) zDn <- psi.Dn(z, pexp, rate=1)
crit. value at alpha=0.05
DnN0p05 <- psi.qlks.exp(1-0.05, length(z))
or, for more precision,
DnN0p05 <- psi.qlks.exp(1-0.05, length(z), Nsim=10000) zDn > DnN0p05 # p-value 1 - psi.plks.exp(zDn, length(z), Nsim=10000)
library(mlmRev) summary(Oxboys) # Oxboys are from library(mlmRev) Oxboys[1:5,] Oxboys[1:5, "height"] dataoxheight <- Oxboys[,"height"] plot(dataoxheight) summary(dataoxheight) hist(dataoxheight, freq=FALSE) # todo: overlay exp pdf? boxplot(dataoxheight) qqnorm(dataoxheight) library(nortest) lillie.test(dataoxheight) ks.test(dataoxheight, pnorm, mean=mean(dataoxheight), sd=sd(dataoxheight)) # for comparison ks.test(unique(dataoxheight), "pnorm", mean=mean(dataoxheight), sd=sd(dataoxheight)) # add some jitter to remove ties; ?psi.jitter # ?psi.jitter ks.test(psi.jitter(dataoxheight,amount=0.5), pnorm, mean=mean(dataoxheight), sd=sd(dataoxheight)) ks.test(psi.jitter(dataoxheight,amount=0.5), pnorm, mean=mean(dataoxheight), sd=sd(dataoxheight)) #?psi.Dn psi.Dn(dataoxheight, pnorm, mean=mean(dataoxheight), sd=sd(dataoxheight), ties.jitter=TRUE) # ?psi.jitter any(duplicated(dataoxheight)) xj <- psi.jitter(dataoxheight, 0.5) any(duplicated(xj)) ks.test(xj, "pnorm", mean=mean(dataoxheight), sd = sd(dataoxheight)) ########################################### datamig ls(pattern="data*") databarbi databulbs
datamilk <- data.frame( x = c(42.7,40.2,38.2,37.6,32.2,32.2,28,27.2,26.6,23,22.7,21.8,21.3,20.2), y = c(1.2,1.16,1.07,1.13,0.96,1.07,0.85,0.87,0.77,0.74,0.76,0.69,0.72,0.64) ) fitmilk <- lm(y~x, data=datamilk) datamilk fitmilk summary(fitmilk) # plot(fitmilk) resfitmilk <- residuals(fitmilk) lillie.test(resfitmilk)
todo: also qq-plot? ## Lilliefors test for exponentiality
?psi.plks ?psi.plks.exp ?psi.lks.exp.test ?psi.Dn ?psi.pks
databulbs <- scan("bulbs1000.txt") ks.test(databulbs, "pexp", rate=2) mean(databulbs) 1/mean(databulbs) ks.test(databulbs, "pexp", rate=1/1000) psi.plks.exp(0.203, df = length(databulbs)) 1 - psi.plks.exp(0.203, df = length(databulbs)) curve(psi.pks(x,length(databulbs)), from=0, to=1) curve(psi.plks.exp(x,length(databulbs)), from=0, to=1, col="blue", add=TRUE) abline(h=0.95) # ?names # ?c databulbs ks.test(databulbs, "pexp", rate=1/1000) psi.Dn(databulbs, "pexp", rate=1/1000)
q : P(Dn < q) = 0.95 for selected values of n
psi.pks(0.6239385, 4) psi.pks(0.2940753, 20) psi.pks(0.1340279, 100) psi.pks(0.04294685, 1000) # asymptotic approximation databulbs psi.lks.exp.test(databulbs) length(databulbs) psi.plks.exp(0.203, df=10) 1 - psi.plks.exp(0.203, df=10)
title: "ks.R" author: "mcbssgb2" date: "Tue Jan 09 20:18:38 2018"
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.