#' Fit distributions via L-moments
#'
#' Fit several distributions via L-moments with \code{lmomco::\link[lmomco]{lmom2par}}
#' and compute goodness of fit measures.
#'
#' @return invisible dlf object, see \code{\link{printL}}.
#' @author Berry Boessenkool, \email{berry-b@@gmx.de}, Sept 2014, July 2015, Dec 2016
#' @seealso \code{\link{plotLfit}}, \code{\link{distLweights}}, \code{\link{plotLweights}},
#' \code{extRemes::\link[extRemes]{fevd}}, \code{MASS::\link[MASS]{fitdistr}}.\cr
#' More complex estimates of quality of fits:
#' Fard, M.N.P. and Holmquist, B. (2013, Chilean Journal of Statistics):
#' Powerful goodness-of-fit tests for the extreme value distribution.
#' https://chjs.mat.utfsm.cl/volumes/04/01/Fard_Holmquist(2013).pdf
#'
#' @keywords hplot dplot distribution univar
#' @export
#' @importFrom lmomco dist.list lmoms lmom2par plmomco
#' @importFrom berryFunctions rainbow2 tryStack rmse rsquare quantileMean
#' @importFrom utils getFromNamespace
#' @importFrom stats ecdf ks.test
#'
#' @examples
#'
#' data(annMax)
#' # basic usage on real data (annual discharge maxima in Austria)
#' dlf <- distLfit(annMax)
#' str(dlf, max.lev=2)
#' printL(dlf)
#' plotLfit(dlf)
#'
#' # arguments that can be passed to plotting function:
#' plotLfit(dlf, lty=2, col=3, nbest=17, legargs=list(lwd=3), main="booh!")
#' set.seed(42)
#' dlf_b <- distLfit(rbeta(100, 5, 2))
#' plotLfit(dlf_b, nbest=10, legargs=c(x="left"))
#' plotLfit(dlf_b, selection=c("gpa", "glo", "gev", "wak"))
#' plotLfit(dlf_b, selection=c("gpa", "glo", "gev", "wak"), order=TRUE)
#' plotLfit(dlf_b, distcols=c("orange",3:6), lty=1:3) # lty is recycled
#' plotLfit(dlf_b, cdf=TRUE)
#' plotLfit(dlf_b, cdf=TRUE, histargs=list(do.points=FALSE), sel="nor")
#'
#'
#' # logarithmic axes:
#' set.seed(1)
#' y <- 10^rnorm(300, mean=2, sd=0.3) # if you use 1e4, distLfit will be much slower
#' hist(y, breaks=20)
#' berryFunctions::logHist(y, col=8)
#' dlf <- distLfit(log10(y))
#' plotLfit(dlf, breaks=50)
#' plotLfit(dlf, breaks=50, log=TRUE)
#'
#'
#' # Goodness of fit: how well do the distributions fit the original data?
#' # measured by RMSE of cumulated distribution function and ?ecdf
#' # RMSE: root of average of ( errors squared ) , errors = line distances
#' dlf <- distLfit(annMax, ks=TRUE)
#' plotLfit(dlf, cdf=TRUE, sel=c("wak", "revgum"))
#' x <- sort(annMax)
#' segments(x0=x, y0=lmomco::plmomco(x, dlf$parameter$revgum), y1=ecdf(annMax)(x), col=2)
#' segments(x0=x, y0=lmomco::plmomco(x, dlf$parameter$wak), y1=ecdf(annMax)(x), col=4, lwd=2)
#' # weights by three different weighting schemes, see distLweights:
#' plotLweights(dlf)
#' plotLfit(distLfit(annMax ), cdf=TRUE, nbest=17)$gof
#' plotLfit(distLfit(annMax, truncate=0.7), cdf=TRUE, nbest=17)$gof
#' pairs(dlf$gof[,-(2:5)]) # measures of goodness of fit are correlated quite well here.
#' dlf$gof
#'
#' # Kolmogorov-Smirnov Tests for normal distribution return slightly different values:
#' library(lmomco)
#' ks.test(annMax, "pnorm", mean(annMax), sd(annMax) )$p.value
#' ks.test(annMax, "cdfnor", parnor(lmoms(annMax)))$p.value
#'
#'
#' # Fit all available distributions (30):
#' \dontrun{# this takes a while...
#' d_all <- distLfit(annMax, speed=FALSE, progbars=TRUE) # 20 sec
#' printL(d_all)
#' plotLfit(d_all, nbest=30, distcols=grey(1:22/29), xlim=c(20,140))
#' plotLfit(d_all, nbest=30, ylim=c(0,0.04), xlim=c(20,140))
#' plotLweights(d_all)
#' d_all$gof
#' }
#'
#' @param dat Vector with values
#' @param datname Character string for main, xlab etc.
#' DEFAULT: \code{deparse(substitute(dat))}
#' @param selection Selection of distributions. Character vector with types
#' as in \code{\link[lmomco]{lmom2par}}. Overrides speed. DEFAULT: NULL
#' @param speed If TRUE, several distributions are omitted, for the reasons
#' shown in \code{lmomco::\link[lmomco]{dist.list}()}. DEFAULT: TRUE
#' @param ks Include ks.test results and CDF R^2 in \code{dlf$gof}?
#' Computing is much faster when FALSE. DEFAULT: FALSE
#' @param truncate Number between 0 and 1. POT Censored \code{\link{distLquantile}}:
#' fit to highest values only (truncate lower proportion of x).
#' Probabilities are adjusted accordingly. DEFAULT: 0
#' @param threshold POT cutoff value. If you want correct percentiles,
#' set this only via truncate, see Details of \code{\link{q_gpd}}.
#' DEFAULT: \code{\link[berryFunctions]{quantileMean}(x, truncate)}
#' @param progbars Show progress bars for each loop? DEFAULT: TRUE if n > 200
#' @param time \code{\link{message}} execution time? DEFAULT: TRUE
#' @param quiet Suppress notes? DEFAULT: FALSE
#' @param ssquiet Suppress sample size notes? DEFAULT: quiet
#' @param \dots Further arguments passed to \code{\link{distLweights}}
#' like weightc, order=FALSE
#'
distLfit <- function(
dat,
datname=deparse(substitute(dat)),
selection=NULL,
speed=TRUE,
ks=FALSE,
truncate=0,
threshold=berryFunctions::quantileMean(dat, truncate),
progbars=length(dat)>200,
time=TRUE,
quiet=FALSE,
ssquiet=quiet,
... )
{
# preparation ------------------------------------------------------------------
StartTime <- Sys.time()
datname <- datname # evaluate promise before dat is changed
# Progress bars
if(quiet) progbars <- FALSE
if(progbars) lapply <- pbapply::pblapply
# checks:
if( ! is.numeric(dat) ) stop("dat must be numeric.")
if(!is.vector(dat) & !quiet) message("Note in distLfit: dat was not a vector.")
# remove NAs, convert to vector:
dat_full <- dat
dat <- as.numeric(dat)
if(any(!is.finite(dat)))
{
Na <- !is.finite(dat)
if(!quiet) message("Note in distLfit: ", sum(Na), " Inf/NA",
ifelse(sum(Na)>1,"s were"," was")," omitted from ", length(dat),
" data points (",round(sum(Na)/length(dat)*100,1),"%).")
dat <- dat[!Na]
}
# truncate (fit values only to upper part of values):
dat <- dat[dat>=threshold]
# GPD fits in q_gpd all use x>t, not x>=t, but if t=0, I want to use _all_ data
dat <- sort(dat, decreasing=TRUE)
# possible / selected distributions --------------------------------------------
dn <- lmomco::dist.list()
names(dn) <- dn
# remove some to save time and errors, see ?dist.list # gld, gov and tri added
if(speed) dn <- dn[ ! dn %in%
c("aep4","cau","emu","gep","gld","gov","kmu","kur","lmrq","sla","st3","texp","tri")]
#
# Selection:
if( ! is.null(selection) )
{
if(!is.character(selection)) stop("Since Version 0.4.36 (2015-08-31), 'selection' _must_ be a character string vector.")
nondn <- selection[!selection %in% dn]
if(length(nondn)>0 & !quiet) message("Note in distLfit: selection (", toString(nondn),
") not available in lmomco::dist.list().")
dn <- selection ; names(dn) <- dn
}
#
# output list ------------------------------------------------------------------
dlf <- list(parameter=as.list(replace(dn,TRUE,NA)),
dat_full=dat_full, dat=dat, datname=datname,
distnames=dn, distcols=berryFunctions::rainbow2(length(dn)),
distselector="distLfit", distfailed="",
truncate=truncate, threshold=threshold)
# Check remaining sample size:
if(length(dat) < 5)
{
if(!ssquiet) message("Note in distLfit: sample size (", length(dat),
") is too small to fit parameters (<5).")
# error output:
dlf$gof <- distLweights(replace(dn,TRUE,NA), quiet=TRUE, ...)
dlf$error <- paste0("dat size too small (",length(dat),")")
dlf$distfailed <- dn
return(invisible(dlf))
}
#
# Fit distribution parameters --------------------------------------------------
# This is the actual work...
# L-Moments of sample # package lmomco
mom <- lmomco::lmoms(dat, nmom=5)
if(lmomco::are.lmom.valid(mom))
{
# estimate parameters for each distribution: # this takes time!
if(progbars) message("Parameter estimation from L-moments:")
dlf$parameter <- lapply(dn, function(d) tryStack(lmomco::lmom2par(mom, type=d), silent=TRUE) )
} else
{
if(!quiet) message("Note in distLfit: L-moments are not valid. No distributions are fitted.")
dlf$gof <- distLweights(replace(dn,TRUE,NA), quiet=TRUE, ...)
dlf$error <- c(error="invalid lmomco::lmoms", mom)
dlf$distfailed <- dn
}
names(dlf$parameter) <- dn
#
# Error check ------------------------------------------------------------------
failed <- sapply(dlf$parameter, function(x)
{
if(is.null(x)) return(TRUE)
if(all(is.na(x))) return(TRUE)
if(inherits(x, "try-error")) return(TRUE)
if(x$type=="kap")
{
if(x$ifail!=0) return(TRUE)
if(round(x$support[2],7)<=round(x$support[1],7) ) return(TRUE)
quant <- lmomco::quakap(seq(truncate,1,len=200), para=x)
# plot(xxx<-seq(-100,50,len=900), lmomco::pdfkap(xxx, para=x), type="l")
if(length(unique(quant))<20) return(TRUE) # xx5 in test-quantile
}
cumuprob <- suppressWarnings(try(lmomco::plmomco(mean(dlf$dat),x), silent=TRUE))
if(is.null(cumuprob)||inherits(cumuprob,"try-error")) return(TRUE)
anyNA(x$para)
})
if(any(failed))
{
dlf$parameter[failed] <- NA # needed in kappa cases like xx3 in test-quantile
dlf$distfailed <- dn[failed]
if(!quiet) message("Note in distLfit: ", sum(failed),"/",length(dn),
" distributions could not be fitted: ", toString(dlf$distfailed),
if(sum(!failed)<2) "\nGOF cannot be compared")
###dn <- dn[!failed]
}
# Goodness of Fit --------------------------------------------------------------
# CDFS for RMSE (and R2): (dat must be sorted at this point)
if(progbars) message("Calculating CDFs:")
# Theoretical CumulatedDensityFunctions:
tcdfs <- suppressWarnings(
lapply(dn, function(d) tryStack(lmomco::plmomco(dat,dlf$parameter[[d]]), silent=TRUE)))
# support region check:
if(!quiet)
{
nNA <- sapply(tcdfs, function(x) sum(is.na(x)))
if(any(nNA>0))
message("Note in distLfit: there are NAs in CDF (distribution support region ",
"probably does not span the whole data range): ",
toString(paste0(dn[nNA>0], " (", nNA[nNA>0], ")" )),
" of ", length(tcdfs[[1]]), " values.")
}
ecdfs <- ecdf(dlf$dat)(dat) # Empirical CDF
# rescale for truncation
tcdfs[sapply(tcdfs, inherits, "try-error")] <- NA
# RMSE:
if(progbars) message("Calculating RMSE:")
RMSE <- suppressWarnings(
lapply(dn, function(d) tryStack(rmse(tcdfs[[d]], ecdfs, quiet=TRUE),
silent=TRUE)))
# change nonfitted distributions RMSE to NA:
RMSE <- sapply(RMSE, function(x) if(inherits(x, "try-error")) NA else x)
# distribution weights:
dlf$gof <- distLweights(RMSE, quiet=if(sum(!failed)<2) TRUE else quiet, ...)
# ks and R^2 values:
if(ks)
{
# Kolmogorov-Smirnov test:
if(progbars) message("Performing ks.test:")
## library("lmomco") # flagged by R CMD check, not necessary if plmomco is imported
ksA <- suppressWarnings(
lapply(dn, function(d) tryStack(ks.test(dlf$dat, "plmomco",
dlf$parameter[[d]]), silent=TRUE)))
ksP <- sapply(ksA, function(x) x$p.value )
ksD <- sapply(ksA, function(x) x$statistic )
# R2
if(progbars) message("Calculating R2:")
R2 <- suppressWarnings(
lapply(dn, function(d) tryStack(rsquare(tcdfs[[d]], ecdfs, quiet=TRUE),
silent=TRUE)))
R2 <- sapply(R2, function(x) if(inherits(x, "try-error")) NA else x)
# add to output data.frame:
dlf$gof$ksP <- ksP[rownames(dlf$gof)]
dlf$gof$ksD <- ksD[paste0(rownames(dlf$gof),".D")]
dlf$gof$R2 <- R2[rownames(dlf$gof)]
}
#
# time message + output --------------------------------------------------------
dlf$distnames <- rownames(dlf$gof)
if(time & !quiet) message("distLfit execution took ",
signif(difftime(Sys.time(), StartTime, units="s"),2), " seconds.")
return(invisible(dlf))
} # end of function
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.