Quality of distribution fits
Description
Calculate goodness of fit for several distributions, plot rank comparison.
Usage
1 2 
Arguments
dlf 
List as returned by 
gofProp 
Overrides value in list. Proportion (0:1) of highest values in

plot 
Call 
progbars 
Show progress bars for each loop? DEFAULT: TRUE if n > 200 
ks 
Include ks.test results in 
weightc 
Optional: a named vector with custom weights for each distribution.
Are internally normalized to sum=1 after removing nonfitted dists.
Names must match the parameter names from 
quiet 
Suppress notes? DEFAULT: FALSE 
... 
Further arguments passed to 
Value
List as explained in extremeStat
. The added element is gof,
a data.frame the root mean square error (RMSE) and R squared (R2) for the top gofProp
of dat
,
if ks=TRUE, the p and D values from a simple ks.test,
as well as weights by three different approaches for each distribution function.
The weights are inverse to RMSE, weight1 for all dists, weight2 places zero weight on the worst function, weight3 on the worst half of functions.
Note
If you get a note in distLgof: NAs removed in CDF ...
, this
probably means that the support of some of the fitted distributions do not
span the whole data range. Instead the outside support regions get NAs that
are then detected by rmse and rsquare. I plan to fix this with WHA's new supdist.
Author(s)
Berry Boessenkool, berryb@gmx.de, Sept 2014 + July 2015
See Also
distLgofPlot
, distLfit
.
More complex estimates of quality of fits:
Fard, M.N.P. and Holmquist, B. (2013, Chilean Journal of Statistics):
Powerful goodnessoffit tests for the extreme value distribution.
http://chjs.mat.utfsm.cl/volumes/04/01/Fard_Holmquist(2013).pdf
Examples
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96  library(lmomco)
data(annMax)
# Goodness of Fit is measured by RMSE of cumulated distribution function and ?ecdf
dlf < distLfit(annMax, cdf=TRUE, nbest=17)
distLplot(dlf, cdf=TRUE, sel=c("wak", "revgum"))
dlf$gof
x < sort(annMax)
segments(x0=x, y0=plmomco(x, dlf$parameter$revgum), y1=ecdf(annMax)(x), col=2)
segments(x0=x, y0=plmomco(x, dlf$parameter$wak), y1=ecdf(annMax)(x), col=4, lwd=2)
plot(ecdf(annMax), add=TRUE)
# RMSE: root of average of ( errors squared ) , errors = line distances
# weights by three different weighting schemes
distLgofPlot(dlf, ranks=FALSE, weights=TRUE)
distLgof(dlf, ks=TRUE, plot=FALSE)$gof
# KolmogorovSmirnov Tests return slightly different values:
ks.test(annMax, "pnorm", mean(annMax), sd(annMax) )$p.value
ks.test(annMax, "cdfnor", parnor(lmoms(annMax)))$p.value
# GOF: how well do the distributions fit the original data?
pairs(dlf$gof[,1:3]) # measures of goodness of fit are correlated quite well here.
# In the next example, however, we see that it does matter which one is used.
# compare Ranks with different proportions used for GOF
par(mfrow=c(1,2))
distLgofPlot(dlf, weights=FALSE)
d < distLgof(dlf, gofProp=0.8, plot=TRUE, weights=FALSE)
par(mfrow=c(1,1))
# effect of Proportion of values used to calculate RMSE
dlf100 < distLfit(annMax, gofProp=1, nbest=19, breaks=10) # the default gofProp: 100%
distLgofPlot(dlf100)
dlf50 < distLgof(dlf100, gofProp=0.5)
# so revgum, nor and rice do well on the upper half by R2, but bad by RMSE
distLplot(dlf50, breaks=10)
# The red dashed line shows the cut above which the data were used to get R2/rmse
distLplot(dlf=dlf50, cdf=TRUE)
distLplot(dlf=dlf50, selection=c("pe3","wei", "rice", "nor", "revgum"),
xlim=c(60,120), ylim=c(0.5, 1), cdf=TRUE, col=1)
dlf50$gof
compranks < function(d)
{
gofProp < 0.5
x < sort(annMax, decreasing=TRUE)[ 1:(gofProp*length(annMax)) ]
tcdfs < plmomco(x,dlf50$parameter[[d]])
ecdfs < ecdf(annMax)(x) # Empirical CDF
# Root Mean Square Error, R squared:
berryFunctions::linReg(tcdfs, ecdfs, lwd=1, pch=16, main=d, digits=5, xlim=c(0.5, 1),
ylim=c(0.5, 1), pos1="topleft")
abline(a=0,b=1, lty=3)
c(berryFunctions::rmse(tcdfs, ecdfs), berryFunctions::rsquare(tcdfs, ecdfs))
}
dn < rownames(dlf50$gof)
op < par(mfrow=c(5,4), mar=rep(1,4), xaxt="n", yaxt="n")
for(i in dn) compranks(i)
par(op)
# so revgum, nor and rice systematically deviate from ECDF.
# RMSE is better to sort by than R2
## Not run: ## to save CRAN check computing time
# custom weights
cw < c("gpa"=7, "gev"=3, "wak"=6, "wei"=4, "kap"=3.5, "gum"=3, "ray"=2.1,
"ln3"=2, "pe3"=2.5, "gno"=4, "gam"=5)
dlf < distLfit(annMax, plot=FALSE, weightc=cw)
distLgofPlot(dlf, ranks=TRUE)
dev.new()
distLplot(dlf50, cdf=TRUE, sel=c("pe3", "rice", "revgum"), order=T)
x < sort(annMax, decreasing=TRUE)[ 1:(0.5*length(annMax)) ]
tcdfs < plmomco(x,dlf50$parameter[["revgum"]])
ecdfs < ecdf(annMax)(x) # Empirical CDF
plot(x, tcdfs, type="o", col=2)
points(x, ecdfs)
dev.new()
linReg(tcdfs, ecdfs, type="o")
abline(a=0,b=1, lty=3)
dev.new(record=TRUE)
for(i in 1:10/10) distLgof(dlf100, gofProp=i, weights=FALSE,
main=paste("upper", i*100, "% used for R2"))
# depending on which proportion of the data the GOF is calculated with, different
# distributions are selected to be the 5 best.
dev.off()
## End(Not run) # end dontrun
