# R/distLfit.R In extremeStat: Extreme Value Statistics and Quantile Estimation

#### Documented in distLfit

```#' Fit distributions via L-moments
#'
#' Fit several distributions via L-moments with \code{lmomco::\link[lmomco]{lmom2par}}
#' and compute goodness of fit measures.
#'
#' @return invisible dlf object, see \code{\link{printL}}.
#' @author Berry Boessenkool, \email{[email protected]@gmx.de}, Sept 2014, July 2015, Dec 2016
#'     More complex estimates of quality of fits:
#'     Fard, M.N.P. and Holmquist, B. (2013, Chilean Journal of Statistics):
#'     Powerful goodness-of-fit tests for the extreme value distribution.
#'     http://chjs.mat.utfsm.cl/volumes/04/01/Fard_Holmquist(2013).pdf
#'
#' @keywords hplot dplot distribution univar
#' @export
#' @importFrom lmomco dist.list lmoms lmom2par plmomco
#' @importFrom berryFunctions rainbow2 tryStack rmse rsquare quantileMean
#' @importFrom utils getFromNamespace
#' @importFrom stats ecdf ks.test
#'
#' @examples
#'
#' data(annMax)
#' # basic usage on real data (annual discharge maxima in Austria)
#' dlf <- distLfit(annMax)
#' str(dlf, max.lev=2)
#' printL(dlf)
#' plotLfit(dlf)
#'
#' # arguments that can be passed to plotting function:
#' plotLfit(dlf, lty=2, col=3, nbest=17, legargs=list(lwd=3), main="booh!")
#' set.seed(42)
#' dlf_b <- distLfit(rbeta(100, 5, 2))
#' plotLfit(dlf_b, nbest=10, legargs=c(x="left"))
#' plotLfit(dlf_b, selection=c("gpa", "glo", "gev", "wak"))
#' plotLfit(dlf_b, selection=c("gpa", "glo", "gev", "wak"), order=TRUE)
#' plotLfit(dlf_b, distcols=c("orange",3:6), lty=1:3) # lty is recycled
#' plotLfit(dlf_b, cdf=TRUE)
#' plotLfit(dlf_b, cdf=TRUE, histargs=list(do.points=FALSE), sel="nor")
#'
#'
#' # logarithmic axes:
#' set.seed(1)
#' y <- 10^rnorm(300, mean=2, sd=0.3) # if you use 1e4, distLfit will be much slower
#' hist(y, breaks=20)
#' berryFunctions::logHist(y, col=8)
#' dlf <- distLfit(log10(y))
#' plotLfit(dlf, breaks=50)
#' plotLfit(dlf, breaks=50, log=TRUE)
#'
#'
#' # Goodness of fit: how well do the distributions fit the original data?
#' # measured by RMSE of cumulated distribution function and ?ecdf
#' # RMSE: root of average of ( errors squared )  ,   errors = line distances
#' dlf <- distLfit(annMax, ks=TRUE)
#' plotLfit(dlf, cdf=TRUE, sel=c("wak", "revgum"))
#' x <- sort(annMax)
#' segments(x0=x, y0=lmomco::plmomco(x, dlf\$parameter\$revgum), y1=ecdf(annMax)(x), col=2)
#' segments(x0=x, y0=lmomco::plmomco(x, dlf\$parameter\$wak), y1=ecdf(annMax)(x), col=4, lwd=2)
#' # weights by three different weighting schemes, see distLweights:
#' plotLweights(dlf)
#' plotLfit(distLfit(annMax              ), cdf=TRUE, nbest=17)\$gof
#' plotLfit(distLfit(annMax, truncate=0.7), cdf=TRUE, nbest=17)\$gof
#' pairs(dlf\$gof[,-(2:5)]) # measures of goodness of fit are correlated quite well here.
#' dlf\$gof
#'
#' # Kolmogorov-Smirnov Tests for normal distribution return slightly different values:
#' library(lmomco)
#' ks.test(annMax, "pnorm", mean(annMax), sd(annMax) )\$p.value
#' ks.test(annMax, "cdfnor", parnor(lmoms(annMax)))\$p.value
#'
#'
#' # Fit all available distributions (30):
#' \dontrun{# this takes a while...
#' d_all <- distLfit(annMax, speed=FALSE, progbars=TRUE) # 20 sec
#' printL(d_all)
#' plotLfit(d_all, nbest=30, distcols=grey(1:22/29), xlim=c(20,140))
#' plotLfit(d_all, nbest=30, ylim=c(0,0.04), xlim=c(20,140))
#' plotLweights(d_all)
#' d_all\$gof
#' }
#'
#' @param dat       Vector with values
#' @param datname   Character string for main, xlab etc.
#'                  DEFAULT: \code{deparse(substitute(dat))}
#' @param selection Selection of distributions. Character vector with types
#'                  as in \code{\link[lmomco]{lmom2par}}. Overrides speed. DEFAULT: NULL
#' @param speed     If TRUE, several distributions are omitted, for the reasons
#'                  shown in \code{lmomco::\link[lmomco]{dist.list}()}. DEFAULT: TRUE
#' @param ks        Include ks.test results and CDF R^2 in \code{dlf\$gof}?
#'                  Computing is much faster when FALSE. DEFAULT: FALSE
#' @param truncate  Number between 0 and 1. POT Censored \code{\link{distLquantile}}:
#'                  fit to highest values only (truncate lower proportion of x).
#'                  Probabilities are adjusted accordingly. DEFAULT: 0
#' @param threshold POT cutoff value. If you want correct percentiles,
#'                  set this only via truncate, see Details of \code{\link{q_gpd}}.
#' @param progbars  Show progress bars for each loop? DEFAULT: TRUE if n > 200
#' @param time      \code{\link{message}} execution time? DEFAULT: TRUE
#' @param quiet     Suppress notes? DEFAULT: FALSE
#' @param ssquiet   Suppress sample size notes? DEFAULT: quiet
#' @param \dots     Further arguments passed to \code{\link{distLweights}}
#'                  like weightc, order=FALSE
#'
distLfit <- function(
dat,
datname=deparse(substitute(dat)),
selection=NULL,
speed=TRUE,
ks=FALSE,
truncate=0,
threshold=berryFunctions::quantileMean(dat, truncate),
progbars=length(dat)>200,
time=TRUE,
quiet=FALSE,
ssquiet=quiet,
... )
{
# preparation ------------------------------------------------------------------
StartTime <- Sys.time()
datname <- datname # evaluate promise before dat is changed
# Progress bars
if(quiet) progbars <- FALSE
if(progbars) lapply <- pbapply::pblapply
# checks:
if( ! is.numeric(dat) ) stop("dat must be numeric.")
if(!is.vector(dat) & !quiet) message("Note in distLfit: dat was not a vector.")
# remove NAs, convert to vector:
dat_full <- dat
dat <- as.numeric(dat)
if(any(!is.finite(dat)))
{
Na <- !is.finite(dat)
if(!quiet) message("Note in distLfit: ", sum(Na), " Inf/NA",
ifelse(sum(Na)>1,"s were"," was")," omitted from ", length(dat),
" data points (",round(sum(Na)/length(dat)*100,1),"%).")
dat <- dat[!Na]
}
# truncate (fit values only to upper part of values):
dat <- dat[dat>=threshold]
# GPD fits in q_gpd all use x>t, not x>=t, but if t=0, I want to use _all_ data
dat <- sort(dat, decreasing=TRUE)
# possible / selected distributions --------------------------------------------
dn <- lmomco::dist.list()
names(dn) <- dn
# remove some to save time and errors, see ?dist.list # gld, gov and tri added
if(speed) dn <- dn[ ! dn %in%
c("aep4","cau","emu","gep","gld","gov","kmu","kur","lmrq","sla","st3","texp","tri")]
#
# Selection:
if( ! is.null(selection) )
{
if(!is.character(selection)) stop("Since Version 0.4.36 (2015-08-31), 'selection' _must_ be a character string vector.")
nondn <- selection[!selection %in% dn]
if(length(nondn)>0 & !quiet) message("Note in distLfit: selection (", toString(nondn),
") not available in lmomco::dist.list().")
dn <- selection ; names(dn) <- dn
}
#
# output list ------------------------------------------------------------------
dlf <- list(parameter=as.list(replace(dn,TRUE,NA)),
dat_full=dat_full, dat=dat, datname=datname,
distnames=dn, distcols=berryFunctions::rainbow2(length(dn)),
distselector="distLfit", distfailed="",
truncate=truncate, threshold=threshold)

# Check remaining sample size:
if(length(dat) < 5)
{
if(!ssquiet) message("Note in distLfit: sample size (", length(dat),
") is too small to fit parameters (<5).")
# error output:
dlf\$gof <- distLweights(replace(dn,TRUE,NA), quiet=TRUE, ...)
dlf\$error <- paste0("dat size too small (",length(dat),")")
dlf\$distfailed <- dn
return(invisible(dlf))
}
#
# Fit distribution parameters --------------------------------------------------
# This is the actual work...
# L-Moments of sample  # package lmomco
mom <- lmomco::lmoms(dat, nmom=5)
if(lmomco::are.lmom.valid(mom))
{
# estimate parameters for each distribution:    # this takes time!
if(progbars) message("Parameter estimation from L-moments:")
dlf\$parameter <- lapply(dn, function(d) tryStack(lmomco::lmom2par(mom, type=d), silent=TRUE) )
} else
{
if(!quiet) message("Note in distLfit: L-moments are not valid. No distributions are fitted.")
dlf\$gof <- distLweights(replace(dn,TRUE,NA), quiet=TRUE, ...)
dlf\$error <- c(error="invalid lmomco::lmoms", mom)
dlf\$distfailed <- dn
}
names(dlf\$parameter) <- dn
#
# Error check ------------------------------------------------------------------
failed <- sapply(dlf\$parameter, function(x)
{
if(is.null(x)) return(TRUE)
if(all(is.na(x))) return(TRUE)
if(inherits(x, "try-error")) return(TRUE)
if(x\$type=="kap")
{
if(x\$ifail!=0) return(TRUE)
if(round(x\$support[2],7)<=round(x\$support[1],7) ) return(TRUE)
quant <- lmomco::quakap(seq(truncate,1,len=200), para=x)
# plot(xxx<-seq(-100,50,len=900), lmomco::pdfkap(xxx, para=x), type="l")
if(length(unique(quant))<20) return(TRUE) # xx5 in test-quantile
}
cumuprob <- suppressWarnings(try(lmomco::plmomco(mean(dlf\$dat),x), silent=TRUE))
if(is.null(cumuprob)||inherits(cumuprob,"try-error")) return(TRUE)
anyNA(x\$para)
})
if(any(failed))
{
dlf\$parameter[failed] <- NA # needed in kappa cases like xx3 in test-quantile
dlf\$distfailed <- dn[failed]
if(!quiet) message("Note in distLfit: ", sum(failed),"/",length(dn),
" distributions could not be fitted: ", toString(dlf\$distfailed),
if(sum(!failed)<2) "\nGOF cannot be compared")
###dn <- dn[!failed]
}
# Goodness of Fit --------------------------------------------------------------
# CDFS for RMSE (and R2): (dat must be sorted at this point)
if(progbars) message("Calculating CDFs:")
# Theoretical CumulatedDensityFunctions:
tcdfs <- suppressWarnings(
lapply(dn, function(d) tryStack(lmomco::plmomco(dat,dlf\$parameter[[d]]), silent=TRUE)))
# support region check:
if(!quiet)
{
nNA <- sapply(tcdfs, function(x) sum(is.na(x)))
if(any(nNA>0))
message("Note in distLfit: there are NAs in CDF (distribution support region ",
"probably does not span the whole data range): ",
toString(paste0(dn[nNA>0], " (", nNA[nNA>0], ")" )),
" of ", length(tcdfs[[1]]), " values.")
}
ecdfs <- ecdf(dlf\$dat)(dat) # Empirical CDF
# rescale for truncation
tcdfs[sapply(tcdfs, inherits, "try-error")] <- NA
# RMSE:
if(progbars) message("Calculating RMSE:")
RMSE <- suppressWarnings(
lapply(dn, function(d)  tryStack(rmse(tcdfs[[d]], ecdfs, quiet=TRUE),
silent=TRUE)))
# change nonfitted distributions RMSE to NA:
RMSE <- sapply(RMSE, function(x) if(inherits(x, "try-error")) NA else x)
# distribution weights:
dlf\$gof <- distLweights(RMSE, quiet=if(sum(!failed)<2) TRUE else quiet, ...)
# ks and R^2 values:
if(ks)
{
# Kolmogorov-Smirnov test:
if(progbars) message("Performing ks.test:")
## library("lmomco") # flagged by R CMD check, not necessary if plmomco is imported
ksA <- suppressWarnings(
lapply(dn, function(d) tryStack(ks.test(dlf\$dat, "plmomco",
dlf\$parameter[[d]]), silent=TRUE)))
ksP <- sapply(ksA, function(x) x\$p.value   )
ksD <- sapply(ksA, function(x) x\$statistic )
# R2
if(progbars) message("Calculating R2:")
R2 <- suppressWarnings(
lapply(dn, function(d)  tryStack(rsquare(tcdfs[[d]], ecdfs, quiet=TRUE),
silent=TRUE)))
R2 <- sapply(R2, function(x) if(inherits(x, "try-error")) NA else x)
dlf\$gof\$ksP <- ksP[rownames(dlf\$gof)]
dlf\$gof\$ksD <- ksD[paste0(rownames(dlf\$gof),".D")]
dlf\$gof\$R2  <-  R2[rownames(dlf\$gof)]
}
#
# time message + output --------------------------------------------------------
dlf\$distnames <- rownames(dlf\$gof)
if(time & !quiet) message("distLfit execution took ",
signif(difftime(Sys.time(), StartTime, units="s"),2), " seconds.")
return(invisible(dlf))
} # end of function
```

## Try the extremeStat package in your browser

Any scripts or data that you put into this service are public.

extremeStat documentation built on May 2, 2019, 3:26 a.m.