Quality of distribution fits

Share:

Description

Calculate goodness of fit for several distributions, plot rank comparison.

Usage

1
2
distLgof(dlf, gofProp, plot = TRUE, progbars = length(dlf$dat) > 200,
  ks = TRUE, weightc = NA, quiet = FALSE, ...)

Arguments

dlf

List as returned by distLfit, containing the elements dat, datname, parameter, gofProp

gofProp

Overrides value in list. Proportion (0:1) of highest values in dat to compute goodness of fit (dist / ecdf) with. This enables to focus on the dist tail

plot

Call distLgofPlot? DEFAULT: TRUE

progbars

Show progress bars for each loop? DEFAULT: TRUE if n > 200

ks

Include ks.test results in dlf$gof? Computing is much faster when FALSE. DEFAULT: TRUE

weightc

Optional: a named vector with custom weights for each distribution. Are internally normalized to sum=1 after removing nonfitted dists. Names must match the parameter names from distLfit. DEFAULT: NA

quiet

Suppress notes? DEFAULT: FALSE

...

Further arguments passed to distLgofPlot

Value

List as explained in extremeStat. The added element is gof,
a data.frame the root mean square error (RMSE) and R squared (R2) for the top gofProp of dat,
if ks=TRUE, the p and D values from a simple ks.test,
as well as weights by three different approaches for each distribution function.
The weights are inverse to RMSE, weight1 for all dists, weight2 places zero weight on the worst function, weight3 on the worst half of functions.

Note

If you get a note in distLgof: NAs removed in CDF ..., this probably means that the support of some of the fitted distributions do not span the whole data range. Instead the outside support regions get NAs that are then detected by rmse and rsquare. I plan to fix this with WHA's new supdist.

Author(s)

Berry Boessenkool, berry-b@gmx.de, Sept 2014 + July 2015

See Also

distLgofPlot, distLfit. More complex estimates of quality of fits: http://chjs.soche.cl/papers/vol4n1_2013/ChJS-04-01-04.pdf

Examples

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
library(lmomco)
data(annMax)

# Goodness of Fit is measured by RMSE of cumulated distribution function and ?ecdf
dlf <- distLfit(annMax, cdf=TRUE, nbest=17)
distLplot(dlf, cdf=TRUE, sel=c("wak", "revgum"))
dlf$gof
x <- sort(annMax)
segments(x0=x, y0=plmomco(x, dlf$parameter$revgum), y1=ecdf(annMax)(x), col=2)
segments(x0=x, y0=plmomco(x, dlf$parameter$wak), y1=ecdf(annMax)(x), col=4, lwd=2)
plot(ecdf(annMax), add=TRUE)
# RMSE: root of average of ( errors squared )  ,   errors = line distances

# weights by three different weighting schemes
distLgofPlot(dlf, ranks=FALSE, weights=TRUE)
distLgof(dlf, ks=TRUE, plot=FALSE)$gof

# Kolmogorov-Smirnov Tests return slightly different values:
ks.test(annMax, "pnorm", mean(annMax), sd(annMax) )$p.value
ks.test(annMax, "cdfnor", parnor(lmoms(annMax)))$p.value


# GOF: how well do the distributions fit the original data?
pairs(dlf$gof[,1:3]) # measures of goodness of fit are correlated quite well here.
# In the next example, however, we see that it does matter which one is used.

# compare Ranks with different proportions used for GOF
par(mfrow=c(1,2))
distLgofPlot(dlf, weights=FALSE)
d <- distLgof(dlf, gofProp=0.8, plot=TRUE, weights=FALSE)
par(mfrow=c(1,1))


# effect of Proportion of values used to calculate RMSE
dlf100 <- distLfit(annMax, gofProp=1, nbest=19, breaks=10) # the default gofProp: 100%
distLgofPlot(dlf100)

dlf50 <- distLgof(dlf100, gofProp=0.5)
# so revgum, nor and rice do well on the upper half by R2, but bad by RMSE
distLplot(dlf50, breaks=10)
# The red dashed line shows the cut above which the data were used to get R2/rmse

distLplot(dlf=dlf50, cdf=TRUE)
distLplot(dlf=dlf50, selection=c("pe3","wei", "rice", "nor", "revgum"),
          xlim=c(60,120), ylim=c(0.5, 1), cdf=TRUE, col=1)
dlf50$gof

compranks <- function(d)
{
gofProp <- 0.5
x <- sort(annMax, decreasing=TRUE)[  1:(gofProp*length(annMax))  ]
tcdfs <- plmomco(x,dlf50$parameter[[d]])
ecdfs <- ecdf(annMax)(x) # Empirical CDF
# Root Mean Square Error, R squared:
berryFunctions::linReg(tcdfs, ecdfs, lwd=1, pch=16, main=d, digits=5, xlim=c(0.5, 1),
       ylim=c(0.5, 1), pos1="topleft")
abline(a=0,b=1, lty=3)
c(berryFunctions::rmse(tcdfs, ecdfs), berryFunctions::rsquare(tcdfs, ecdfs))
}
dn <- rownames(dlf50$gof)

op <- par(mfrow=c(5,4), mar=rep(1,4), xaxt="n", yaxt="n")
for(i in dn) compranks(i)
par(op)
# so revgum, nor and rice systematically deviate from ECDF.
# RMSE is better to sort by than R2


## Not run:  ## to save CRAN check computing time

# custom weights
cw <- c("gpa"=7, "gev"=3, "wak"=6, "wei"=4, "kap"=3.5, "gum"=3, "ray"=2.1,
        "ln3"=2, "pe3"=2.5, "gno"=4, "gam"=5)
dlf <- distLfit(annMax, plot=FALSE, weightc=cw)
distLgofPlot(dlf, ranks=TRUE)


dev.new()
distLplot(dlf50, cdf=TRUE, sel=c("pe3", "rice", "revgum"), order=T)
x <- sort(annMax, decreasing=TRUE)[  1:(0.5*length(annMax))  ]
tcdfs <- plmomco(x,dlf50$parameter[["revgum"]])
ecdfs <- ecdf(annMax)(x) # Empirical CDF
plot(x, tcdfs, type="o", col=2)
points(x, ecdfs)
dev.new()
linReg(tcdfs, ecdfs, type="o")
abline(a=0,b=1, lty=3)

dev.new(record=TRUE)
for(i in 1:10/10) distLgof(dlf100, gofProp=i, weights=FALSE,
                           main=paste("upper", i*100, "% used for R2"))
# depending on which proportion of the data the GOF is calculated with, different
# distributions are selected to be the 5 best.
dev.off()

## End(Not run) # end dontrun