options(digits = 3)
require(fitdistrplus)
set.seed(123)
addbounds <- function(d)
{
  xbounds <- c(d$left, d$right)
  xboundsnotNA <- xbounds[!is.na(xbounds)]
  abline(v = xboundsnotNA, col = "grey")
}
addLR <- function(d)
{
  Lbounds <- d$left[!is.na(d$left)]
  Rbounds <- d$right[!is.na(d$right)]
  range <- range(c(Lbounds,Rbounds)) 
  eps <- (range[2] - range[1]) * 0.01
  text(x = Lbounds-eps, y = 0.05, labels = "L", col = "red")
  text(x = Rbounds+eps, y = 0.05, labels = "R", col = "red")
}

addeq <- function(d)
{
  left <- d$left
  left[is.na(left)] <- -100
  right <- d$right
  right[is.na(right)] <- 100
  rect(left, -2, right, 2, density = 10)
}

addTurnbull <- function(d, conf.int = FALSE)
{
  (survdata <- Surv(time = d$left, time2 = d$right, type="interval2"))
  survfitted <- survfit(Surv(left, right, type="interval2") ~ 1, data = d)
  points(survfitted$time, 1 - survfitted$surv, col = "orange", type = "s", 
         lwd = 5, lty = 2)
  if (conf.int == TRUE)
  {
      points(survfitted$time, 1 - survfitted$lower, col = "orange", type = "s", 
             lwd = 2, lty = 3)
      points(survfitted$time, 1 - survfitted$upper, col = "orange", type = "s", 
             lwd = 2, lty = 3)
  }
}

General presentation of the package fitdistrplus

A package to help the fit of parametric distributions to univariate discrete our continuous non censored or censored data.

par(mfrow = c(1,3))
set.seed(124)
r <- rgamma(100, shape = 8, rate = 1)
f <- fitdist(r, "gamma")
b <- bootdist(f)
f.mme <- fitdist(r, "gamma", method = "mme")
f.mge <- fitdist(r, "gamma", method = "mge", gof="AD2R")
f.qme <- fitdist(r, "gamma", method = "qme", probs = c(0.25, 0.75))
denscomp(list(f, f.mme, f.qme, f.mge), legendtext = c("MLE", "MME", "QME", "MGE"),
         main = "Different fitting methods", xlim = c(2, 16), fitlty = 1)
CIcdfplot(b, CI.output = "quantile", CI.fill = "pink", main = "Uncertainty using bootstrap")

rd <- rpois(50, lambda = 5)
fp <- fitdist(rd, "pois")
denscomp(fp, demp = TRUE, main = "Fit of discrete distributions")

Goodness-of-fit plots for non censored data

An example with non censored data

r <- rweibull(100, shape = 3, scale = 1)
fw <- fitdist(r, "weibull")
fg <- fitdist(r, "gamma")
fl <- fitdist(r, "lnorm")
gofstat(list(fw, fg, fl), 
        fitnames = c("Weibull", "gamma", "lnorm"))

A goodness-of-fit plot in density plot

denscomp(list(fw, fg, fl), demp = TRUE, fitlty = 1)

A goodness-of-fit plot in CDF

cdfcomp(list(fw, fg, fl), fitlty = 1)

A Q-Q plot which emphasizes differences at tails

qqcomp(list(fw, fg, fl))

a P-P plot which emphasizes differences in the center

ppcomp(list(fw, fg, fl))

Representation of the ECDF for censored data

How to represent an ECDF from censored data ?

A first toy example with left, right and interval censored data

d0 <- data.frame(left = c(NA, 2, 4, 7, 10), right = c(1, 2.5, 6, 8, NA))
d <- d0
d
par(mar = c(3, 4, 0.1, 0.1))
plotdistcens(d, NPMLE = FALSE, lwd = 5, col = "blue", main = "") 

Non Parametric Maximum Likelihood Estimation (NPMLE) of the ECDF: the Turnbull plot (package survival) used in former versions of fitdistrplus.

par(mar = c(3, 4, 0.1, 0.1))
layout(matrix(c(1,2,2,2), nrow =4, byrow = TRUE))
par(mar = c(3, 4, 0.1, 0.1))
plotdistcens(d, NPMLE = FALSE, lwd = 5, col = "blue", main = "")
addbounds(d)
plotdistcens(d, NPMLE.method = "Turnbull", lwd = 5, col = "orange", main = "")
addbounds(d)

A new algorithm and plot from the package npsurv (Wang)

# par(mfrow = c(2, 1))
layout(matrix(c(1,2,2,2), nrow =4, byrow = TRUE))
par(mar = c(3, 4, 0.1, 0.1))
plotdistcens(d, NPMLE = FALSE, lwd = 5, col = "blue", main = "")
addbounds(d)
plotdistcens(d, NPMLE = TRUE, lwd = 5, main = "")
addTurnbull(d)
addbounds(d)
legend("right", lty = c(1,2), lwd = c(5,5), col = c("black", "orange"),
       legend = c("Wang", "Turnbull"), bty = "n", cex = 2)

The two steps of an NPMLE algorithm

  1. Identification of equivalence classes (also named Turnbull intervals or maximal intersection intervals or innermost intervals or maximal cliques of the data) = set of points/intervals under which the ECDF may change (each region between a left bound \textcolor{red}{L} immediately followed by a right bound \textcolor{red}{R}, even if of null length). The NPMLE is unique only up to these equivalence classes (non uniqueness represented by grey rectangles).

Equivalence classes on the first toy example

par(mar = c(3, 4, 0.1, 0.1))
plotdistcens(d, NPMLE = FALSE, lwd = 5, col = "blue", main = "")
addbounds(d)
addLR(d)
addeq(d)

The two steps of an NPMLE algorithm

  1. Identification of equivalence classes (also named Turnbull intervals or maximal intersection intervals or innermost intervals or maximal cliques of the data) = set of points/intervals under which the ECDF may change (each region between a left bound \textcolor{red}{L} immediately followed by a right bound \textcolor{red}{R}, even if of null length). The NPMLE is unique only up to these equivalence classes (non uniqueness represented by grey rectangles).

Equivalence classes on a second toy example

d0b <- data.frame(left = c(NA, 2, 4, 6, 10), right = c(1, 3, 7, 8, NA))
d <- d0b
deq <- data.frame(left = c(NA, 2, 6, 10), right = c(1, 3, 7, NA))
par(mar = c(3, 4, 0.1, 0.1))
plotdistcens(d, NPMLE = FALSE, lwd = 5, col = "blue", main = "")
addbounds(d)
addLR(d)
addeq(deq)

The two steps of an NPMLE algorithm

  1. Identification of equivalence classes (also named Turnbull intervals or maximal intersection intervals or innermost intervals or maximal cliques of the data) = set of points/intervals under which the ECDF may change (each region between a left bound \textcolor{red}{L} immediately followed by a right bound \textcolor{red}{R}, even if of null length). The NPMLE is unique only up to these equivalence classes (non uniqueness represented by grey rectangles).

  2. Assign a probability mass to each equivalence class (may be 0).

The Wang plot on the second toy example

par(mar = c(3, 4, 0.1, 0.1))
plotdistcens(d, NPMLE = TRUE, lwd = 5, main = "")
addbounds(d)

The two steps of an NPMLE algorithm

  1. Identification of equivalence classes (also named Turnbull intervals or maximal intersection intervals or innermost intervals or maximal cliques of the data) = set of points/intervals under which the ECDF may change (each region between a left bound \textcolor{red}{L} immediately followed by a right bound \textcolor{red}{R}, even if of null length). The NPMLE is unique only up to these equivalence classes (non uniqueness represented by grey rectangles).

  2. Assign a probability mass to each equivalence class (may be 0).

Various algorithms implemented in the packages Icens, interval and npsurv (more or less performant and not all handling left censored data).

A third toy example

d3 <- data.frame(left = c(-1.4, -1.4, 2, -1.4, 0),
                 right = c(1, 2, NA, 0, 2))
d <- d3
# par(mfrow = c(2, 1))
layout(matrix(c(1,2,2,2), nrow =4, byrow = TRUE))
par(mar = c(3, 4, 0.1, 0.1))
plotdistcens(d, NPMLE = FALSE, lwd = 5, col = "blue", main = "")
addbounds(d)
addLR(d)
plotdistcens(d, NPMLE = TRUE, lwd = 5, main = "")
addTurnbull(d)
addbounds(d)
legend("right", lty = c(1,2), lwd = c(5,5), col = c("black", "orange"),
       legend = c("Wang", "Turnbull"), bty = "n", cex = 2)

The third toy example with the add of a non censored obs.

d3b <- data.frame(left = c(1.18, -1.4, -1.4, 2, -1.4, 0),
                 right = c(1.18, 1, 2, NA, 0, 2))
d <- d3b
# par(mfrow = c(2, 1))
layout(matrix(c(1,2,2,2), nrow =4, byrow = TRUE))
par(mar = c(3, 4, 0.1, 0.1))
plotdistcens(d, NPMLE = FALSE, lwd = 5, col = "blue", main = "")
addbounds(d)
addLR(d)
plotdistcens(d, NPMLE = TRUE, lwd = 5, main = "")
addTurnbull(d)
addbounds(d)
legend("right", lty = c(1,2), lwd = c(5,5), col = c("black", "orange"),
       legend = c("Wang", "Turnbull"), bty = "n", cex = 2)

A realistic example: data salinity

data(salinity)
d <-  log10(salinity)
# par(mfrow = c(2, 1))
layout(matrix(c(1,2,2,2), nrow =4, byrow = TRUE))
par(mar = c(3, 4, 0.1, 0.1))
plotdistcens(d, NPMLE = FALSE, lwd = 0.2, col = "blue", main = "")
addbounds(d)
addLR(d)
plotdistcens(d, NPMLE = TRUE, lwd = 5, main = "")
addbounds(d)

New CDF, Q-Q and P-P plots implemented for censored data

Use of cdfcompcens() to assess the fit of 3 distributions on data smokedfish

data(smokedfish)
d <-  log10(smokedfish)
fitsfn <- fitdistcens(d,"norm")
fitsfl <- fitdistcens(d,"logis")
dgumbel <- function(x,a,b) 1/b*exp((a-x)/b)*exp(-exp((a-x)/b))
pgumbel <- function(q,a,b) exp(-exp((a-q)/b))
qgumbel <- function(p,a,b) a-b*log(-log(p))
fitsfg<-fitdistcens(d,"gumbel",start=list(a=-3,b=3))
# par(mfrow = c(2, 1))
layout(matrix(c(1,2,2), nrow =3, byrow = TRUE))
par(mar = c(3, 4, 2, 0.1))
plotdistcens(d, NPMLE = FALSE, lwd = 0.2, col = "blue", main = "")
# addbounds(d)
cdfcompcens(list(fitsfn,fitsfl,fitsfg), fitlty = 1)
# addbounds(d)

Use of qqcompcens() for one distribution

par(mfrow = c(1, 2))
par(mar = c(4, 4, 2, 1))
cdfcompcens(fitsfn, fitlty = 1)
qqcompcens(fitsfn)

Use of ppcompcens() for one distribution

par(mfrow = c(1, 2))
par(mar = c(4, 4, 2, 1))
cdfcompcens(fitsfn, fitlty = 1)
ppcompcens(fitsfn)

Q-Q plots and P-P plot for the 3 distributions

par(mfrow = c(1, 2))
par(mar = c(4, 4, 2, 1))
qqcompcens(list(fitsfn,fitsfl,fitsfg))
ppcompcens(list(fitsfn,fitsfl,fitsfg))

An alternative presentation of the Q-Q plots for the 3 dist.

par(mfrow = c(2,2))
par(mar = c(4, 4, 2, 1))
qqcompcens(fitsfn, fitcol = "red", fillrect = "red")
qqcompcens(fitsfl, fitcol = "green", fillrect = "green")
qqcompcens(fitsfg, fitcol = "blueviolet", fillrect = "blueviolet")

Will be soon implemented in the plotstyle ggplot.

Another example with data salinity

data(salinity)
d <-  log10(salinity)
f <- fitdistcens(d,"norm")
par(mfrow = c(2,2))
par(mar = c(4, 4, 2, 1))
plotdistcens(d, NPMLE = FALSE, main = "")
cdfcompcens(f)
qqcompcens(f)
ppcompcens(f)

How to use of these new goodness-of-fit plots ?

Example of code:

data(smokedfish)
d <-  log10(smokedfish)
# Plot of the NPMLE CDF on censored data
plotdistcens(d)
# Two MLE fits
fitsfn <- fitdistcens(d,"norm")
fitsfl <- fitdistcens(d,"logis")
# Three goodness-of-fit plots for one fit
plot(fitsfn)
# Goodness-of-fit plots for one or more fits
cdfcompcens(list(fitsfn,fitsfl))
qqcompcens(list(fitsfn,fitsfl))
ppcompcens(list(fitsfn,fitsfl))

Other recent improvements of fitdistrplus

Version 1.0-8

Version 1.0-10

Version 1.0-11

References

\begin{center} \begin{Large} Thank you for your attention !

We are waiting for your feedback on these new tools. \end{Large} \end{center}



aursiber/fitdistrplus documentation built on Oct. 18, 2024, 1:20 a.m.