## based on the following documents from 2014/2015:
##    - QQandKSnew.html (R source lost)

```r
## package 'psistat' is discussed further below, so don't echo it here.
library(psistat)
set.seed(1234)

Packages and functions for topic "goodness of fit"

Functions in base R

?ks.test
?ecdf
?qqnorm
?qqline

Package "psistat"

Package psistat provides some useful functions for this topic. You can install it on your own computers from the web page of the course (if you have not done this yet).

library(psistat)

The names of the functions in "psistat" start with "psi.". You can learn more about them using the help system. For example:

apropos("psi.*")
help(package="psistat")
package ? psistat
?psi.eqf
?psi.Dn
?psi.lks.exp.test
?psi.jitter

Other libraries providing relevant functions:

library(nortest) # for Lilliefors test
?lillie.test

```r
library(e1071)  # for probplot
?probplot

Dataset Oxboys is in package "mlmRev"

library(mlmRev)
# ?Oxboys

Miscellaneous: use this command to plot in a new window:

dev.new()

Some random samples for the following examples.

x20 <- rnorm(20)            # a short random sample, to show it on screen
x200 <- rnorm(200)          # larger
x500 <- rnorm(500)          # even larger
x500ms <- rnorm(500, mean = 1, sd = 3)

x2 <- rchisq(100, df = 2)

xe <- rexp(500, rate = 1/1500) # life times of electric bulbs

Empirical cdf (ecdf)

x20.ecdf <- ecdf(x20)              # its ecdf
x20.ecdf

x20.ecdf is a function - we can use it as any other R function:

x20.ecdf(0)
x20.ecdf(10)

Here we plot the ecdf and overlay the cdf used to simulate the data. The sample is small, so the two do not match very closely:

plot(x20.ecdf)
curve(pnorm, add = TRUE, col = "blue")

At the places were the ecdf jumps, the value is the one after the jump (indicated by the filled points). Here is a histogram with overlayed pdf for comparison.

hist(x20, freq = FALSE, ylim = c(0, 0.5))
curve(dnorm, add = TRUE)

For larger samples the ecdf is very good estimator of the underlying cdf:

x200.ecdf <- ecdf(x200)
plot(x200.ecdf)
curve(pnorm, add = TRUE, col = "red")

... and an even larger sample:

x500.ecdf <- ecdf(x500)            # its ecdf
plot(x500.ecdf)                    # ecdf and cdf overlayed
curve(pnorm, add = TRUE, col = "red")
## This shows how to add the curve with non-default values of the
## arguments. This simply illustrates the technique:
norm2 <- function(x) pnorm(x, mean = 2)
curve(norm2, add = TRUE, col = "blue")

Empirical quantiles

# ?quantile
quantile(x500, probs=0.5)
quantile(x500, probs= c(0.25,0.5, 0.75))

quantile(x20, 0.5)
hist(x20, freq=FALSE, ylim = c(0, 0.5))
curve(dnorm, from = -3, to = 3,  add = TRUE, col = "blue")

Examples with psi.eqf

Empirical quantile function (from package "psistat")

?psi.eqf

The eqf of a small sample:

x20.eqf <- psi.eqf(x20)
plot(x20.eqf, main = "Eqf of x20")
curve(qnorm, add = TRUE, col = "blue")

# plot(x200)
x200ecdf <- ecdf(x200)
x200ecdf(0)
quantile(x200, c(.25, .5, 0.75))

x200eqf <- psi.eqf(x200)
x200eqf(0.5)
x200eqf(0.25)
plot(x200eqf, main = "Eqf of x200")

x500.eqf <- psi.eqf(x500)          # a larger sample
plot(x500.eqf, main = "Eqf of x500")
curve(qnorm, add = TRUE, col = "blue")

```r
## #' Illustrate that quantiles can be plotted even without quantile f. (The "Weekend fun"
## #' question that stood for several weeks.)
## y500  <- pnorm(x500)
## plot(sort(x500), sort(y500))  # plot a cdf
##
## plot(sort(y500), sort(x500))  # plot the inverse cdf without using quantile function

QQ-plots

DIY qq-plots

We follow the procedure given in the notes: prepare points p, evaluate the scores s, compute the order statistics and plot. Does x20 come from N(0,1)?

n <- length(x20)
n

p <- (1:n - 0.5)   # to check that it gives 0.5, 1.5, ..., 19.5
p

p <- (1:n - 0.5)/n
p

s <- qnorm(p)            # scores
x20.sorted <- sort(x20)  # order statistics
plot(s, x20.sorted)      # qq-plot
abline(0, 1, col = "blue") # plot the reference line

Does x2 come from N(0,1)?

n <- length(x2)
p <- ((1:n) - 0.5) / n
s2 <- qnorm(p)
x2s <- sort(x2)
plot(s2, x2s)
abline(0, 1, col = "red")

Does x200 come from N(0,1)?

n <- length(x200)
p <- ((1:n) - 0.5) / n
si <- qnorm(p)
x200.sorted <- sort(x200)
plot(si, x200.sorted)
abline(0, 1, col = "blue")

Does x200 come from Student-t with 3 d.f.?

si <- qt(p, df = 3)
plot(si, x200.sorted)
abline(0,1, col = "blue")

Does x500 come from N(0,1)?

n <- length(x500)                  # x500
p <- (1:n - 0.5)/n
s <- qnorm(p)
x500.sorted <- sort(x500)
plot(s, x500.sorted)
abline(0,1)                        # various ways to overlay a line
lm(x500.sorted ~ s)

abline(lm(x500.sorted ~ s), col = "red")
qqline(x500.sorted)

qq-plots for x500 The following examples are of qq-plots for x500 with various non-matching distributions.

s <- qt(p, df = 3)      # H0 is t_3 dist
plot(s, x500.sorted)
abline(0, 1, col="blue")

s <- qnorm(p, mean = 5) # H0 is N(5,1)
plot(s,x500.sorted)
abline(0, 1, col = "blue")

s <- qnorm(p, mean = 5, sd = 2) # H0 is N(5,2^2)
plot(s, x500.sorted)
abline(0, 1, col = "blue")

x500ms.sorted <- sort(x500ms)
p <- ((1:500) - 0.5)/500
s <- qnorm(p)
plot(s, x500ms.sorted)
abline(0, 1, col = "blue")

lmfit500 <- lm(x500ms.sorted ~ s)
summary(lmfit500)
abline(lmfit500, col = "red")

qq-plot example with an exponential distribution. First generate some data to work with: xe might represent lifetime of bulbs (incandescent have typical expected life equal to 1500 hours).

hist(xe)

n <- length(xe)
p <- ((1:n) - 0.5) / n
s <- qexp(p, rate = 1/1500)
plot(s, sort(xe))

Non-diy qq-plots

qqnorm produces a normal qq-plot, i.e. a qq-plot for hypothesised normal distribution:

qqnorm(x500ms)
abline(0,1)

qqnorm(x500)                   # qqnorm
qqnorm(xe)     # no match here

probplot() is another function for qq-plots. Without further arguments it produces a normal plot:

library(e1071, quiet = TRUE)  # for probplot
probplot(x500)

For other hypothesised distributions the relevant quantile function should be given. We often need to define the function ourselves.

#
# this doesn't work since probplot insists on naming the argument 'p'
# qexp1500 <- function(x) qexp(x, rate=1/1500)
#
                                        # here we oblige.

This function computes quantiles for exponential distribution with mean 1500:

qexp1500 <- function(p) qexp(p, rate = 1/1500)

(Note that probplot insists that the first argument is called 'p'.) Compare:

qexp1500(0.5)
qexp(0.5, rate=1/1500)

Create the plot:

probplot(xe, qexp1500)

x2a <- rnorm(500, mean=5, sd=2)
qqnorm(x2a)

mean(x2a)
sd(x2a)

example: bulbs.txt

bulbs <- scan("bulbs.txt")
bulbs

qexp1500 <- function(p){ qexp(p, rate=1/1500) }
probplot(bulbs, qdist = qexp1500)

e100 <- rexp(100, rate=1/1500)
p <- ((1:100) - 0.5)/100
s <- qexp(p, rate=1/1500)
e100s <- sort(e100)
plot(s, e100s)
abline(0,1)

e1000 <- rexp(1000, rate=1/1500)
p <- ((1:1000) - 0.5)/1000
s <- qexp(p, rate=1/1500)
e1000s <- sort(e1000)
plot(s, e1000s)
abline(0,1)

s <- qnorm(p)
plot(s, e1000s)

library(e1071)
probplot(e1000, qexp)
probplot(e1000, qnorm)

# ?qexp

qchisq5 <- function(p) qchisq(p, df=5)
probplot(e1000,qchisq5)

Distributions of order statistics

Approx mean and variance of order statistics example in Notes, p. 69

first using formulae on pp. 68-69

qnorm(4/(19+1))

ex <- qnorm(4/(19+1))
ex

exvar <- 4*(19-4+1)/((19+1)^2*(19+2))/dnorm(ex)^2
exvar

... then via simulation

y <- numeric(1000)
y[1] <- sort(rnorm(19))[4]
y[2] <- sort(rnorm(19))[4]

and so on until y[1000] ... :)

# ... but better use a single command
for(i in 1:1000) y[i] <- sort(rnorm(19))[4]

hist(y, freq=FALSE)  # estimate density

mean(y)              # estimate mean

var(y)               # estimate variance

this is an alternative to the 'for' loop (explain "replicate")

y <- replicate(1000, sort(rnorm(19))[4])
mean(y)
var(y)
quantile(y, c(0.25,0.5,0.75))

N <- 1000
x4 <- numeric(N)
x4[1] <- sort(rnorm(19))[4]
x4[1]
x4[2] <- sort(rnorm(19))[4]
# ... and so on; let's do it with a single command:
for(i in 1:N) x4[i] <- sort(rnorm(19))[4]

Explore the distribution of the fourth order statistic:

mean(x4)
var(x4)
hist(x4)

x4ecdf <- ecdf(x4)
curve(x4ecdf, from = -2, to = 2)

plot(x4ecdf)
curve(pnorm, add = TRUE, col = "red")

Inverse PIT

DIY generation of a random sample from distribution Expo(1/2) First generate a sample from the uniform distribution.

u <- runif(8)
u

Evaluate the quantiles of the required distribution (Expo(1/2) here) for the values in the U(0,1) random sample:

y <- -2*log(1-u)     #  DIY quantile function of Expo(1/2)
y

y1 <- qexp(u, rate = 1/2) # built-in quantile function
y1

The results are the same (so, qexp uses the formula -log(1-u)/lambda):

all(y == y1)

Kolmogorov-Smirnov tests

u <- runif(100)
x <- -1/2*log(1-u)
ks.test(x, "pexp", rate=2)

ks.test(x, "pexp", rate=10)

pexp2 <- function(x) pexp(x, rate = 2)
ks.test(x, pexp2)

psi.Dn(x, pexp2)

x <- runif(10)
x

y <- -1/2*log(1-x)
y

y1 <- qexp(x, rate=2)
y1

all(y==y1)  # TRUE (so, qexp uses the above formula

The cdf of the test statistic in KS test See examples for psi.pks.

# ?psi.pks
psi.pks(0.6239385,4)

psi.pks(0.2940753,20)
psi.pks(0.1340279,100)
psi.pks(0.04294685,1000)

Compare the distribution of the test statistic for various sample sizes:

xi <- seq(0,1,length=100)            # some x values
plot(xi,psi.pks(xi,4))
lines(xi,psi.pks(xi,4))              # cdf of D_4
lines(xi,psi.pks(xi,50),col="blue")  # overlay the cdf of  D_{50}
lines(xi,psi.pks(xi,100),col="red")  # overlay the cdf of  D_{100}
abline(h=0.95, col="brown")
lines(xi,psi.pks(xi,1000),col="green")  # overlay the cdf of  D_{1000}

The abscissa of the intersection of the brown line with each of the curves gives the corresponding critical value of the KS test. Notice that for larger samples the critical value is smaller. In other words, smaller deviations from the hypothesised distribution function are considered significant. Similar to above using curve():

curve(psi.pks(x,4), from=0, to=1)                             # cdf of D_4
curve(psi.pks(x,10), from=0, to=1, col="blue", add=TRUE)      # cdf of D_10
curve(psi.pks(x,50), from=0, to=1, col="brown", add=TRUE)     # cdf of D_50
curve(psi.pks(x,100), from=0, to=1, col="red", add=TRUE)      # cdf of D_100
curve(psi.pks(x,1000), from=0, to=1, col="green", add=TRUE)   # cdf of D_1000
abline(h=0.95, col="brown")

DIY Dn

x10 <- rnorm(10)

diy Dn for H_0 = cdf of N(0,1)

u10 <- pnorm(x10)
u10

u10 <- sort(pnorm(x10))
u10

n <- 10
x10Dn <- max((1:n)/n - u10, u10 - (0:(n-1)/n))
x10Dn

now use psi.Dn to compute Dn, should give the same result.

psi.Dn(x10)

# KS test
#
# ?psi.Dn
x

psi.Dn(x)

The value of the statistic is the same as that from ks.test:

ks.test(x, pnorm)

psi.Dn(x) == ks.test(x, pnorm)$statistic

pe <- function(x) pexp(x, rate=1/1500)
ks.test(x, pe)

KS test 2013/2014 chunk

x20

x20os <- sort(x20)
pnorm(x20os, mean=3, sd=2)

pnorm(x20os, mean=3, sd=2) - (0:(20-1))/n


wrk1 <- pnorm(x20os, mean=3, sd=2) - (0:(20-1))/20
wrk2 <- (1:20)/20 - pnorm(x20os, mean=3, sd=2)
max(wrk1,wrk2)

Dn.x20 <- max(wrk1,wrk2)
Dn.x20

psi.Dn(x20)

# ?psi.Dn
psi.Dn(x20, cdf=pnorm, mean=3, sd=2)

ks.test(x20, pnorm, mean=3, sd=2)

# ?psi.pks
psi.pks(0.5, n=20)

Example migraine

#
# file.show("migraine.txt")
datamig <- scan("migraine.txt")
datamig

##  [1]  98  90 155  86  80  84  70 128  93  40 108  90 130  48  55 106 145
## [18] 126 100 115  75  95  38  66  63  32 105 118  21 142

ks.test(datamig, pnorm, mean=90, sd=35)

ks.test(unique(datamig), pnorm, mean=90, sd=35)

psi.Dn(unique(datamig), pnorm, mean=90, sd=35)

Example from help page of psi.pks

xi <- seq(0,1,length=100)            # some x values
plot(xi, psi.pks(xi,4))               # cdf of D_4

diy Dn

x <- unique(datamig)
x

xs <- sort(x)
xs

n <- length(xs)
max( (1:n)/n - pnorm(xs,mean=90,sd=35), pnorm(xs,mean=90,sd=35) - (0:(n-1))/n)

Dn using psi.Dn

psi.Dn(unique(datamig), pnorm, mean=90, sd=35)

ks.test(unique(datamig),pnorm, mean=90, sd=35)

migDn <- max( (1:n)/n - pnorm(xs,mean=90,sd=35), pnorm(xs,mean=90,sd=35) - (0:(n-1))/n)
migDn

psi.pks(migDn, n)

1 - psi.pks(migDn, n)

Lilliefors test for normality

apropos("psi.")
?psi.pkls.exp
?psi.plks.exp
?psi.qlks.exp

This package provides lillie.test():

library(nortest)
# ?lillie.test
lillie.test(x20)

Further examples

Example: moths

# file.show("mothsontrees.txt")
datamoths <- scan("mothsontrees.txt")
datamoths

# ?punif
ks.test(datamoths, punif, min=0, max=25)

Alternatively:

mycdf <- function(q) punif(q, min=0, max=25)
ks.test(datamoths, mycdf)

mothssorted <- sort(datamoths)

val <- punif(mothssorted, min=0, max=25)
length(datamoths)

n <- 14
(1:n)/n

(1:n)/n - val

val - (0:(n-1))/n

max( (1:n)/n - val, val - (0:(n-1))/n )

ks.test(datamoths, "punif", min=0, max=25)

f1 <- function(x) psi.pks(x,14)
curve(f1, from=0.01, 0.99)

xi <- seq(0,1, 0.01)
head(cbind(xi, "f1(xi)" = f1(xi)))

ks.test(datamoths, "punif", min=0, max=25)

d14 <- max( (1:n)/n - val, val - (0:(n-1))/n )
d14

1 - psi.pks(d14,14)

ks.test(datamoths, "punif", min=0, max=25)

Example: datasales

datasales <- c(2,4,8,18,9,11,13)
ks.test(datasales, "pnorm", mean=10, sd=3)

Example: barbiturate

# file.show("barbiturate.txt")
databarbi <- scan("barbiturate.txt")
databarbi

lillie.test(databarbi)

ks.test(databarbi,pnorm,mean=mean(databarbi), sd=sd(databarbi))

Example: bulbs1000

databulbs <- scan("bulbs1000.txt")
databulbs

ks.test(databulbs, "pexp", rate=2)

ks.test(databulbs, "pexp", rate=1/1000)

mean(databulbs)
1/mean(databulbs)

# apropos("psi.")
# ?psi.lks.exp.test
psi.lks.exp.test(databulbs)

Example: bulbs1500

databulbs15 <- scan("bulbs1500.txt")
psi.lks.exp.test(databulbs15)

psi.lks.exp.test(databulbs15)

psi.lks.exp.test(databulbs15, Nsim=10000)


# example: bulbs: is this the same as bulbs1000?
#
bulbs
mean(bulbs)

ks.test(bulbs, pexp, rate = 1/1500)
ks.test(bulbs, pexp, rate = 1/1000)

Carry out Lilliefors test (simulation is used, so slight differences in repeated calculation)

psi.lks.exp.test(bulbs)
psi.lks.exp.test(bulbs)

psi.lks.exp.test(bulbs, Nsim = 10000)   # for more precision

(semi-)diy (full diy would also calc. the Dn stat. by diy)

z <- bulbs/mean(bulbs)
zDn <- psi.Dn(z, pexp, rate=1)

crit. value at alpha=0.05

DnN0p05 <- psi.qlks.exp(1-0.05, length(z))

or, for more precision,

DnN0p05 <- psi.qlks.exp(1-0.05, length(z), Nsim=10000)

zDn > DnN0p05

# p-value
1 - psi.plks.exp(zDn, length(z), Nsim=10000)

Example: Oxboys (needs clean-up)

library(mlmRev)
summary(Oxboys)          # Oxboys are from library(mlmRev)

Oxboys[1:5,]
Oxboys[1:5, "height"]

dataoxheight <- Oxboys[,"height"]
plot(dataoxheight)

summary(dataoxheight)

hist(dataoxheight, freq=FALSE)  # todo: overlay exp pdf?

boxplot(dataoxheight)

qqnorm(dataoxheight)

library(nortest)
lillie.test(dataoxheight)


ks.test(dataoxheight, pnorm, mean=mean(dataoxheight), sd=sd(dataoxheight)) # for comparison

ks.test(unique(dataoxheight), "pnorm", mean=mean(dataoxheight), sd=sd(dataoxheight))

# add some jitter to remove ties;  ?psi.jitter
# ?psi.jitter
ks.test(psi.jitter(dataoxheight,amount=0.5), pnorm, mean=mean(dataoxheight), sd=sd(dataoxheight))

ks.test(psi.jitter(dataoxheight,amount=0.5), pnorm, mean=mean(dataoxheight), sd=sd(dataoxheight))

#?psi.Dn
psi.Dn(dataoxheight, pnorm, mean=mean(dataoxheight), sd=sd(dataoxheight), ties.jitter=TRUE)

# ?psi.jitter
any(duplicated(dataoxheight))

xj <- psi.jitter(dataoxheight, 0.5)
any(duplicated(xj))

ks.test(xj, "pnorm", mean=mean(dataoxheight), sd = sd(dataoxheight))


###########################################

datamig

ls(pattern="data*")

databarbi
databulbs

Example: checking normality of residuals from lm()

datamilk <-
   data.frame(
     x = c(42.7,40.2,38.2,37.6,32.2,32.2,28,27.2,26.6,23,22.7,21.8,21.3,20.2),
     y = c(1.2,1.16,1.07,1.13,0.96,1.07,0.85,0.87,0.77,0.74,0.76,0.69,0.72,0.64)
   )
fitmilk <- lm(y~x, data=datamilk)
datamilk

fitmilk
summary(fitmilk)

# plot(fitmilk)
resfitmilk <- residuals(fitmilk)
lillie.test(resfitmilk)

todo: also qq-plot? ## Lilliefors test for exponentiality

?psi.plks
?psi.plks.exp
?psi.lks.exp.test
?psi.Dn
?psi.pks
databulbs <- scan("bulbs1000.txt")
ks.test(databulbs, "pexp", rate=2)

mean(databulbs)

1/mean(databulbs)

ks.test(databulbs, "pexp", rate=1/1000)

psi.plks.exp(0.203, df = length(databulbs))

1 - psi.plks.exp(0.203, df = length(databulbs))

curve(psi.pks(x,length(databulbs)), from=0, to=1)
curve(psi.plks.exp(x,length(databulbs)), from=0, to=1, col="blue", add=TRUE)
abline(h=0.95)

# ?names
# ?c
databulbs


ks.test(databulbs, "pexp", rate=1/1000)

psi.Dn(databulbs, "pexp", rate=1/1000)

q : P(Dn < q) = 0.95 for selected values of n

psi.pks(0.6239385,     4)
psi.pks(0.2940753,    20)
psi.pks(0.1340279,   100)
psi.pks(0.04294685, 1000)   # asymptotic approximation

databulbs
psi.lks.exp.test(databulbs)

length(databulbs)

psi.plks.exp(0.203, df=10)

1 - psi.plks.exp(0.203, df=10)

title: "ks.R" author: "mcbssgb2" date: "Tue Jan 09 20:18:38 2018"




GeoBosh/psistat documentation built on Nov. 19, 2020, 8:19 p.m.