R/pot.R

######################################################################
#' Peak over threshold (POT)
#'
#' Fit the parameters of a model using peak over threshold (POT).
#'
#' @param x Sample.
#'
#' @param dt Date or time of observation.
#'
#' @param u Threshold.
#'
#' @param method Estimation method. Either \code{'lmom'}, \code{'mle'} or
#' \code{'mle2'}.
#'
#' @param declust Declustering method, if required.
#'   Either 'run' or 'wrc'.
#'
#' @param r Lag parameter for declustering. Either the run length between clusters
#'   or the minimum separating time between two flood peaks.
#'   The scale must coincide with the observation date \code{dt}.
#'
#' @param rlow For WRC declustering, recession level between two flood peaks, in percentage.
#'
#' @param se Logical. Should the standard error be returned.
#'
#' @param ci For `coef`, should a confidence interval be returned. For `predict`,
#'   the method used to compute confidence intervals: either `profile`, `delta` or `boot`.
#'
#' @param nsim Number of bootstrap samples.
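#'
#' @param varcov Logical. Should the variance-covariance matrix of the estimated
#'   parameters be evaluated.
#'
#' @param q Vector of probabilities defining the flood quantiles returned by
#'   \code{predict}.
#'
#' @param alpha Significance level of the confidence intervals.
#'
#' @param rate Logical. Should the exceedance rate be included with the
#'   parameters.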
#'
#'
#' @details
#'
#' The access functions \code{coef} and \code{vcov} return respectively the
#' parameters and the variance-covariance matrix of the POT model. For the L-moment
#' method, the covariance matrix is estimated by bootstrap.
#' The access function \code{predict} evaluates flood quantiles.
#' If \code{dt} is a Date, the return period is computed in years using the range
#' of the observations.
#'
#'
#' @seealso \link{which.floodPeaks}, \link{which.clusters}, \link{PlotMrl}.
#'
#' @section References:
#'
#' Coles S. (2001) An introduction to statistical modeling of extreme values.
#'   Springer Verlag.
#'
#' Davison AC, Smith RL. (1990) Models for Exceedances over High Thresholds.
#'   Journal of the Royal Statistical Society Series B (Methodological).
#'   52(3):393–442.
#'
#' @export
#'
#' @examples
#'
#' xd <- rgpa(100, 1, -.2)
#' fit <- FitPot(xd, u = 0)
#'
#' print(fit)
#' vcov(fit)
#' predict(fit)
#' coef(fit, ci = TRUE)
#'
#' fit <- FitPot(flow~date, flowStJohn, u = 1000,
#'                declust = 'wrc', r = 14)
#'
#' print(fit)
#' plot(flow~date,flowStJohn, type = 'l')
#' points(fit$time,fit$excess+fit$u, col = 2, pch = 16)
#' abline(h=1000, col = 3, lwd = 2)
#'
#' predict(fit, se = TRUE, ci = 'delta')
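#'
#' ## Flood quantiles for selected return periods (10 and 100 years)
#' predict(fit, q = c(.9, .99), se = TRUE)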
#'

FitPot <- function(x, ...) UseMethod('FitPot',x)

#' @export
#' @rdname FitPot
FitPot.data.frame <- function(obj,  ...)
  FitPot.numeric(x = as.numeric(obj[,2]), dt = as.numeric(obj[,1]), ...)

#' @export
#' @rdname FitPot
FitPot.matrix <- function(obj,  ...)
  FitPot.numeric(x = as.numeric(obj[,2]), dt = as.numeric(obj[,1]), ...)

#' @export
#' @rdname FitPot
FitPot.formula <- function(form, x, ...){
  xd <- model.frame(form,as.list(x))
  FitPot.numeric(x = xd[,1], dt = xd[,2], ...)
}

#' @export
#' @rdname FitPot
FitPot.numeric <- function(x, dt = NULL, u = 0, method = 'mle',
                    declust = 'none', r = 1, rlow = .75, nsim = 1000,
                    varcov = TRUE){

  ## Supply a date variable if none is provided
  if(is.null(dt))
    dt <- seq_along(x)

  ## Compute the number of years, assuming roughly daily observations
  if(inherits(dt, 'Date')){

    nyear <- sum(is.finite(dt)) / 365.25

  } else {
    nyear <- NA
  }

  ## If necessary, extract peaks using a declustering method
  if(declust == 'run')
    cid <- which.clusters(x, dt, u, r)
  else if(declust == 'wrc')
    cid <- which.floodPeaks(x, dt, u, r = r, rlow = rlow, ini = 'wrc')
  else
    cid <- x > u

  ans <- list(excess = x[cid] - u, time = dt[cid])

  ## Estimate the parameters
  if(method == 'lmom'){

    ## Estimate the parameters using L-moments
    ans$estimate <- fgpaLmom(ans$excess)

    ## Covariance matrix estimated by bootstrap
    if(varcov){
      ans$varcov <- matrix(NA,2,2)

      xboot <- replicate(nsim,
                         sample(ans$excess, length(ans$excess), replace = TRUE))
      pboot <- t(apply(xboot,2,fgpaLmom))

      ## Force the bootstrap covariance matrix to be positive definite
      vc <- Matrix::nearPD(cov(pboot))$mat
      ans$varcov <- as.matrix(vc)
    }

  } else if(method == 'mle'){
    sol <- fgpa1d(ans$excess, sol = TRUE)
    ans$estimate <- sol$par

    if(varcov){
      ans$varcov <- sol$varcov
      colnames(ans$varcov) <- rownames(ans$varcov) <- names(ans$estimate)
    }

  } else if(method == 'mle2'){
    sol <- fgpa2d(ans$excess, sol = TRUE)

    ans$estimate <- sol$par

    if(varcov){
      ans$varcov <- sol$varcov
      colnames(ans$varcov) <- rownames(ans$varcov) <- names(ans$estimate)
    }

    if( (ans$estimate[2] < -.5 + 1e-4) | (ans$estimate[2] > 1-1e-4) )
      warning(paste('Shape parameter has reached a boundary. Normal',
                    'approximation of the likelihood may be unreliable.'))
  }

  ## Complete object
  ans$u <- u
  ans$method <- method
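  ## Mean residual life implied by the fitted GPA, i.e. the expected excess
  ## above the threshold: alpha / (1 + kappa), finite when kappa > -1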
  ans$mrl <-  ans$estimate[1]/ (1+ans$estimate[2])
  ans$nyear <- nyear
  ans$ntot <- length(x)
  ans$nexcess <- length(ans$excess)
  ans$nabove <- sum(x>u)

  class(ans) <- 'fpot'
  return(ans)
}


#' @export
#' @rdname FitPot
coef.fpot <- function(obj, rate = FALSE, ci = FALSE, alpha = 0.05){

  if(rate)
    ans <- c(obj$nexcess/obj$ntot,obj$estimate)
  else
    ans <- obj$estimate

  if(ci){

    ## Full likelihood
    llikFull <- logLik(obj)

    ## Initiate searching intervals for parameters
    ase <- sqrt(obj$varcov[1,1])

    abnd0 <- c(pmax(1e-8, obj$estimate[1] - 10 * ase),
               obj$estimate[1] + 10* ase)

    kbnd0 <- c(-1,2)


    ## Bound for the deviance
    khi <- qchisq(1-alpha,1)
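    ## A parameter value lies inside the (1-alpha) profile-likelihood interval
    ## when its deviance, 2 * (llikFull - profile log-likelihood), stays below khi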

    ## Deviance for the profile likelihood of the scale parameter
    Dscl <- function(z){

      nllik <- function(p) -sum(dgpa(obj$excess, z, p, log = TRUE))

      ## Compute the profile likelihood
      llik0 <- -goptimize(nllik, kbnd0)$objective

      return(2*(llikFull-llik0))

    }

    ## Deviance for the profile likelihood of the shape parameter
    Dshp <- function(z){

      nllik <- function(p) -sum(dgpa(obj$excess, p, z, log = TRUE))

      llik0 <- -optimize(nllik, abnd0)$objective

      return(2*(llikFull-llik0))
    }

    ## Find the boundary

    suppressWarnings(lbScl <-
        goptimize(function(z) abs(Dscl(z)-khi),
                 c(abnd0[1],obj$estimate[1]))$minimum)

    suppressWarnings(ubScl <-
                       goptimize(function(z) abs(Dscl(z)-khi),
                                c(obj$estimate[1],abnd0[2]))$minimum)

    suppressWarnings(lbShp <-
                       goptimize(function(z) abs(Dshp(z)-khi),
                                c(kbnd0[1],obj$estimate[2]))$minimum)

    suppressWarnings(ubShp <-
                       goptimize(function(z) abs(Dshp(z)-khi),
                                c(obj$estimate[2],kbnd0[2]))$minimum)

    bnd <- data.frame(lower = c(NA,lbScl,lbShp),
                      upper = c(NA,ubScl,ubShp))

    rownames(bnd) <- c('rate','alpha','kappa')

    if(!rate) bnd <- bnd[-1,]

    ans <- data.frame(estimate = ans, bnd)

  }

  return(ans)
}

#' @export
#' @rdname FitPot
vcov.fpot <- function(obj, rate = FALSE){

  if(rate){
    ans <- matrix(0,3,3)

    kn <- obj$nexcess/obj$ntot

    ans[1,1] <- kn*(1-kn)/obj$ntot
    ans[2:3,2:3] <- obj$varcov

  } else
    ans <- obj$varcov

  return(ans)
}



#' @export
#' @rdname FitPot
print.fpot <- function(obj){

  cat('\nAnalysis of peaks over threshold')

  cat('\n\nThreshold : ', obj$u)
  cat('\nNumber of excesses : ', obj$nexcess)

  if(!is.na(obj$nyear)){
    cat('\nNumber of years : ', format(round(obj$nyear,2)))
    cat('\nExcess rate (yearly): ', format(round(obj$nexcess/obj$nyear,4)))

  } else cat('\nExcess rate: ', format(round(obj$nexcess/obj$ntot,4)))

  cat('\nExtremal index: ', format(round(obj$nexcess/obj$nabove,4)))

  cat('\nDispersion Index : ', format(DispIndex(obj), digits = 4))

  cat('\nMean residual Life : ', format(obj$mrl, digits = 6))

  cat('\n\nParameters :\n')
  print(obj$estimate, digits = 4)

  if(!is.null(obj$varcov)){
    cat('\nstd.err :\n')
    print(sqrt(diag(obj$varcov)), digits = 4)
  }
}

#' @export
#' @rdname FitPot
predict.fpot <- function(obj, q = c(.5, .8, .9, .95, .98, .99), se = FALSE,
                          ci = "none", alpha = 0.05, nsim = 1000, ...){

  ## Transform the probabilities into return periods
  rt <- 1/(1-q)

  ## Convert the return period to the observation time scale (days if dates were provided)
  if(!is.na(obj$nyear))
    mrate <- rt * 365.25
  else
    mrate <- rt

  lambda <- mrate * obj$nexcess / obj$ntot

  ## Function that computes the flood quantile for a given return period
  fpred <- function(para){

    if(abs(para[2]) > 1e-8)
      ans <- obj$u + para[1]/para[2] * (1-lambda^(-para[2]) )
    else
      ans <- obj$u + para[1] * log(lambda)

    return(ans)
  }

  hatRt <- fpred(obj$estimate)

  ## Standard error by delta method
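  ## Delta method: Var(q) is approximated by g' V g, where V = vcov(obj, rate = TRUE)
  ## and g = (gx1, gx2, gx3) contains the partial derivatives of the quantile with
  ## respect to the exceedance rate, the scale (alpha) and the shape (kappa)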
  if(se | ci %in% c('delta', 'profile')){
    negk <- -obj$estimate[2]
    lk <- lambda^negk
    ak <- obj$estimate[1]/negk

    gx1 <-  obj$estimate[1] * (obj$ntot/obj$nexcess)  * lk
    gx2 <- (lk-1)/negk
    gx3 <-  ak * ( lk * log(lambda) - gx2)

    gx <- rbind(gx1, gx2, gx3)
    hatSe <-  sqrt(diag(t(gx) %*% vcov(obj, rate = T) %*% gx))
  }

  ## Confidence interval by the delta method
  if(ci == 'delta'){
     lb <- hatRt - qnorm(1-alpha/2) * hatSe
     ub <- hatRt + qnorm(1-alpha/2) * hatSe
  }

  ## Confidence interval by nonparametric bootstrap
  if(ci == 'boot'){

    ## Resample
    xboot <- replicate(nsim, sample(obj$excess, obj$nexcess, replace = TRUE))

    ## Refit and predict
    if(obj$method == 'lmom')
      fn <- function(z) fpred(fgpaLmom(z))
    else if(obj$method == 'mle2')
      fn <- function(z) fpred(fgpa2d(z))
    else if (obj$method == 'mle')
      fn <- function(z) fpred(fgpa1d(z))

    pboot <- apply(xboot,2,fn)

    ## Compute interval
    bootci <- t(apply(pboot, 1, quantile, c(alpha/2, 1-alpha/2)))
    lb <- bootci[,1]
    ub <- bootci[,2]

  }

  ## Confidence interval by profile likelihood
  if(ci == 'profile'){

    bnd0 <- c(-.5,1)

    ## Bound of the deviance
    khi <- qchisq(1-alpha,1)
    lb <- lb0 <- pmax(1e-8, hatRt - 10 * hatSe)
    ub <- ub0 <- hatRt + 10 * hatSe

    for(ii in seq_along(rt)){

      ## Deviance function with respect to the flood quantile z
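      ## The GPA is reparameterized in terms of the quantile z: for a given
      ## shape p, the implied scale is (z - u) / ((1 - lambda^(-p)) / p)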
      Dfun <- function(z){

        ## Full likelihood
        llikFull <- sum(dgpa(obj$excess, obj$estimate[1],
                                         obj$estimate[2], log = TRUE))

        ## Negative log-likelihood
        nllik <- function(p){

          num <- z - obj$u

          if(abs(p) < 1e-8)
            denum <- log(lambda[ii])
          else
            denum <- -(lambda[ii]^(-p) -1)/p

          return(-sum(dgpa(obj$excess, pmax(1e-8,num/denum), p, log = TRUE)))

        }

        ## Compute the profile likelihood
        llik0 <- -goptimize(nllik, bnd0)$objective
        return(2*(llikFull-llik0))

      }

      ## Lower bound
      suppressWarnings(lb[ii] <-
                goptimize(function(z) abs(Dfun(z) - khi),
                         c(lb0[ii],hatRt[ii]))$minimum)

      ## Upper bound
      suppressWarnings(ub[ii] <-
                         goptimize(function(z) abs(Dfun(z) - khi),
                                  c(hatRt[ii], ub0[ii]))$minimum)

    } # end for

  } # end if

  ## Was a confidence interval computed?
  ci <- !(ci=='none')

  ## construct the output
  if(se & !ci)
    ans <- data.frame(rt = hatRt, se = hatSe)
  else if(!se & ci)
    ans <- data.frame(rt = hatRt, lb = lb, ub = ub)
  else if(se & ci)
    ans <- data.frame(rt = hatRt, se = hatSe, lb = lb, ub = ub)
  else
    ans <- data.frame(rt = hatRt)


  rt <- round(rt,4)

  if(ncol(ans) == 1){
    ans <- ans[,1]
    names(ans) <- paste('Q', rt, sep='')
  } else
    rownames(ans) <- paste('Q', rt, sep='')

  return(ans)
}


#' @export
logLik.fpot <- function(obj)
  sum(dgpa(obj$excess, obj$estimate[1],obj$estimate[2], log = TRUE))

#' @export
AIC.fpot <- function(obj, k = 2)
  -2 * logLik(obj) + 2*k

#############################################################
#' Diagnostic tools for peaks over threshold analysis
#'
#' Produce mean residual life, quantile-quantile and probability-probability plots,
#' a histogram of the excesses,
#' and a stability plot for the parameters of the GPA.
#'
#' @param x,dt,form Sample and time of observation. A formula can also
#'   be passed, in which case \code{x} is a data.frame.
#'
#' @param u Series of candidate thresholds.
#'
#' @param method Estimation method.
#'
#' @param alpha Significance level. Confidence intervals are computed with
#'  level \code{1-alpha}.
#'
#' @param param Choice of the parameter to display. Either
#'  \code{'alpha'} or \code{'kappa'}.
#'
#' @param ... Other arguments passed to the functions \code{plot} and
#' \code{\link{which.floodPeaks}}.
#'
#' @details
#'
#' The function \code{PlotPotShape} loops over calls to
#' \code{\link{FitPot}} and produces a graphic of the
#' shape parameter with respect to the threshold \code{u}. See the respective
#' functions for a description of their parameters.
#'
#' @seealso \link{FitPot}, \link{which.floodPeaks}.
#'
#'
#' @examples
#'
#' ## Find list of candidate thresholds
#' lstu <- seq(500,2500, len = 50)
#'
#' PlotMrl(flow~date,flowStJohn, u = lstu, declust = 'wrc', r = 14)
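#'
#' ## The mean residual life table can also be returned without plotting
#' out <- PlotMrl(flow~date, flowStJohn, u = lstu, declust = 'wrc', r = 14,
#'                display = FALSE)
#' head(out)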
#'
#' PlotPotShape(flow~date,flowStJohn, u = lstu, declust = 'wrc', r = 14)
#' PlotPotShape(flow~date,flowStJohn, u = lstu,
#'             declust = 'wrc', r = 14, param = 'alpha', method = 'mle2')
#'
#'
#' fit <- FitPot(flow~date,flowStJohn, u = 997, declust = 'wrc', r = 14)
#' ppplot(fit)
#' qqplot(fit, ci = TRUE)
#' hist(fit)
#'
#'
#' @export
PlotMrl <- function(form,x, u,
                    declust = NULL, r = 1, rlow = 0.75,
                    alpha = 0.05,
                    ylab = 'Mean Residual Life', xlab = 'Threshold',
                    col = 'black', lty = 1, lwd = 1,
                    col.ci = 'black', lty.ci = 3, lwd.ci = 1,
                    ylim = NULL, display = TRUE, ...){

  ## reorganize the data
  x <- model.frame(form,x)
  dt <- x[,2]
  x <- x[,1]

  u <- as.numeric(u)

  sig <- mrl <- nn <- rep(NA,length(u))
  for(ii in seq_along(u)){

    ## Declustering
    if(is.null(declust))
      cid <- x > u[ii]
    else if(declust == 'run')
      cid <- which.clusters(x, dt, u[ii], r)
    else if( declust == 'wrc')
      cid <- which.floodPeaks(x, dt, u[ii], r = r, rlow = rlow)

    excess <- x[cid]-u[ii]
    mrl[ii] <- mean(excess)
    nn[ii] <- length(excess)
    sig[ii] <- sd(excess)/sqrt(nn[ii])

  }

  ## Confidence intervals
  qn <- qnorm(1-alpha/2)
  lb <- mrl - qn * sig
  ub <- mrl + qn * sig

  ## Adjusted limits
  if(is.null(ylim)) ylim <- range(c(lb,ub)) * c(.95,1.05)

  if(display){
    plot(u, mrl, xlab = xlab, ylab = ylab, ylim = ylim,
         col = col, lty = lty, lwd = lwd, ...)
    lines(u, lb, lty = lty.ci, col = col.ci, lwd = lwd.ci)
    lines(u, ub, lty = lty.ci, col = col.ci, lwd = lwd.ci)
  }

  ## Return
  invisible(data.frame(u = u, n = nn,mrl = mrl, lb = lb, ub = ub))

}

#################################################################
#' Dispersion index
#'
#' Returns the dispersion index (or variance-to-mean ratio) used to verify
#' the hypothesis of a homogeneous Poisson process.
#'
#' @param obj An output of \code{FitPot}.
#'
#' @param k Number of classes. The default value ensures at least 5 classes
#'   and aims at approximately 5 events per class on average.
#'
#' @param ci Should the confidence interval and p-value be returned.
#'
#' @param alpha Significance level.
#'
#' @param method Method for computing the confidence interval and the p-value.
#'   Either 'asymptotic' for the chi-squared approximation or 'boot' for bootstrap.
#'
#' @param nsim Number of bootstrap samples.
#'
#' @details
#'
#' The function \code{PlotDispIndex} loops over calls to
#' \code{\link{FitPot}} and \code{DispIndex} and produces a graphic of the
#' dispersion index with respect to the threshold \code{u}. See the respective
#' functions for a description of their parameters. The points in the
#' plot are returned as a data.frame, and the parameter \code{display} can be set
#' to \code{FALSE} to prevent the display of the graphic.
#'
#' @section Reference:
#'
#' Lang M, Ouarda TBMJ, Bobée B. (1999) Towards operational guidelines for
#'   over-threshold modeling. Journal of Hydrology. Dec 6;225(3):103–17.
#'
#' @seealso \link{PlotMrl}
#'
#' @export
#'
#' @examples
#'
#' fit <- FitPot(flow~date,flowStJohn, u = 1000,
#'               declust = 'wrc', r = 14)
#'
#' DispIndex(fit, ci = TRUE)
#'
#' PlotDispIndex(flow~date,flowStJohn,
#'               u = seq(500,2000, len = 50),
#'               declust = 'wrc', r = 14)
#'
DispIndex <- function(obj, k = NULL, ci = FALSE, alpha = 0.05,
                      method = 'asymptotic', nsim = 1e4){

  ## form equal groups and count peaks
  ntot <- length(obj$time)

  if(is.null(k))
    k <- pmax(5,floor(ntot/5))

  nn <- table(cut(obj$time, k, labels = FALSE))

  ## Compute the dispersion index and p-value
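  ## Under a homogeneous Poisson process the counts per class are Poisson, so the
  ## variance-to-mean ratio should be close to 1 and (k-1) * vmr is approximately
  ## chi-squared with k-1 degrees of freedom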
  vmr <- var(nn)/mean(nn)

  if(ci){
    a2 <- alpha/2

    if(method == 'asymptotic'){

      ## Verification of minimal conditions for the chi-squared approximation
      if(mean(nn) < 1 ) warning('Low average count per class')
      if(k < 5 ) warning('Too few classes for testing')


      ## Confidence interval
      d <- k-1
      myci <- c(qchisq(a2, df = d), qchisq(1-a2, df = d))/d

      ## Compute a two-sided p-value from the chi-squared distribution
      if(vmr < 1) pval <- 2 * pchisq(vmr*d, df = d)
      else if (abs(vmr-1) < 1e-8) pval <- 1
      else pval <- 2 * (1-pchisq(vmr*d,df = d))



    } else if(method == 'boot') {

      ## Sample from the Poisson distribution under the null hypothesis
      bfun <- function(){
        z <- rpois(k,sum(nn)/k)
        var(z)/mean(z)
      }

      bsample <- replicate(nsim, bfun())

      ## Bootstrap intervals and p-value
      myci <- quantile(bsample, c(a2,1-a2))

      if(vmr < 1) pval <- 2 * mean( bsample <= vmr)
      else if (abs(vmr-1) < 1e-8) pval <- 1
      else pval <- 2 * mean(bsample >= vmr)
    }

    ans <- c(vmr, myci , pval)
    names(ans) <- c('vmr','lb','ub', 'pval')

  } else ans <- vmr

  return(ans)
}