floodStat: FloodNet Regional Frequency Analysis

###############################################################################
#' Fitting a index-flood model on a pooling groups
#'
#' Returns a pooling group model with regional parameters and common
#' homogeneity statistics diagnostics.
#' The function can be used for analysing annual maximum or peaks over threshold.
#'
#' @param lmom Matrix of the at-site L-moments of all sites.
#'
#' @param intersite An intersite object. Alternative to pass L-moments directly.
#'
#' @param nrec Record Length of each site. If a single value is passed, it is
#' applied to all sites.
#'
#' @param nk Size of the desired pooling groups.
#'
#' @param distr At-site distribution of each site. If a single value is passed, it is
#' applied to all sites.
#'
#' @param distance Vector of distance with the target site.
#' The target is included with distance zero.
#'
#' @param method Type of data used. Either annual maximums (`amax`) or
#' peaks over threshold (`pot`).
#'
#' @param diagnostic Shoud the diagnostics homogeneity and Z-score be evaluated.
#'
#' @param diagnostic.nsim Number of simulations used to evaluate the diagnostics.
#'
#' @param pvalue Logical. Should the p-value of the homogeneity test be computed?
#'
#' @param pvalue.nrep,pvalue.nsim Number of repetitions and simulations used for
#' evaluating the p-value.
#'
#' @param inter Type of intersite correlation used for simulation. For `'mat'`the
#' empirical correlation matrix is used directly.
#' For `'avg'` the average coefficient of correlation is used.
#'
#'
#' @param inter.corr Intersite correlation matrix of all sites.
#'
#' @return
#'
#' \item{id}{List of the selected sites}
#' \item{nrec}{Record lengths.}
#' \item{distance}{Distance matrix.}
#' \item{lmom}{Matrix of L-moments.}
#' \item{rlmom}{Regional L-moments.}
#' \item{ntot}{Number of station-year.}
#' \item{distr}{Regional distribution}
#' \item{para}{Parameters of the regional distribution}
#' \item{method}{Type of analysis. Annual maximum (`amax`) or peaks over threshold(`pot`)}
#' \item{stat}{Homogenous criteria and Z-score for goodness-of-fit.}
#' \item{pvalue}{P-values of `stat`}
#' \item{discord}{Discordance measures.}
#' \item{inter}{Type of intersite correlation model used.}
#' \item{inter.corr}{Intersite correlation matrix}
#' \item{inter.avg}{Pairwise average of the intersite correlation.}
#'
#' @section References:
#'
#' Hosking, J. R. M., & Wallis, J. R. (1997). Regional frequency analysis:
#'   an approach based on L-moments. Cambridge Univ Pr.
#'
#' Mostofi Zadeh, S., Durocher, M., Burn, D.H., Ashkar, F., 2019.
#'   Pooled flood frequency analysis: a comparison based on peaks-over-threshold
#'   and annual maximum series. Hydrological Sciences Journal 0, null.
#'   https://doi.org/10.1080/02626667.2019.1577556
#'
#' @export
#'
#' @examples
#'
#' isite <- Intersite(ams ~ id + year, flowAtlantic$ams,
#'                    distance = flowAtlantic$distance,
#'                    smooth = TRUE)
#'
#' fit <- PoolGroup(isite, distance = 2, nk = 15)
#' print(fit)
#'
#' ## Create a L-moment ratio diagram.
#' plot(fit)
#'
############################################################################
PoolGroup <- function(x, ...) UseMethod('PoolGroup',x)

#' @export
PoolGroup.intersite <- function(
  obj,
  distance,
  nk,
  inter = 'mat',
  inter.corr = NULL,
  ...)
{

 if (is.null(inter.corr))
   inter.corr <- obj$corr

 if(length(distance) == 1){
   ## verify that distance was passed
   if(is.null(obj$distance))
     stop('Must specify a distance.')



   h <- obj$distance[distance,]

 } else {
   h <- distance
 }



 PoolGroup(obj$lmom, nrec = obj$nrec, distance = h, nk = nk,
           inter = inter, inter.corr = inter.corr, ...)
}


#' @export
PoolGroup.matrix <-function(
  lmom,
  nrec,
  distance,
  nk,
  distr = NULL,
  method = 'amax',
  diagnostic = FALSE,
  diagnostic.nsim = 1000,
  pvalue = FALSE,
  pvalue.nrep = 500,
  pvalue.nsim = 500,
  inter = 'ind',
  inter.corr = 0,
  lscale = FALSE)
{

  ## Number of site and corrected maximum number of site
  nk <- min(nk, nrow(lmom))

  ## Transform the second L-moment if not the LCV
  if (lscale)
    lmom[,2] <- lmom[,2]/lmom[,1]

  if(ncol(lmom) < 4)
    stop('Must have at least 4 L-moments')

  ##---------------------------##
  ## Identify the neighborhood ##
  ##---------------------------##

  ## Find the nearest sites and re-order the data
  nid <- FindNearest(distance,nk)
  lmom <- lmom[nid,]
  distance <- distance[nid]
  nrec <- nrec[nid]

  ## Evaluate the number of station-year
  ntot <- sum(nrec)

  ## Reformat the if a constant value
  if (length(inter.corr)>1){
    inter.corr <- as.matrix(Matrix::nearPD(inter.corr[nid,nid])$mat)
    inter.avg <- inter.corr[lower.tri(inter.corr)]
    inter.avg <- max(mean(inter.avg),0)
  } else {
    inter.avg <- inter.corr
  }

  ##---------------------------------------##
  ## Estimation and homogenity test
  ##---------------------------------------##

  ## compute the regional lmoments
  dd <- lmomRFA::as.regdata(cbind(nid,nrec,lmom), FALSE)
  rlmom <- lmomRFA::regavlmom(dd)
  names(rlmom) <- c('L1','LCV','LSK','LKUR','T5','T6')[seq_along(rlmom)]

  ## POT method always use Pareto distribution
  if (method == 'pot')
      distr <- 'gpa'

  ## If no distribution is passed, the diagnostic routine
  ## must be performed to select the growth curve
  if (is.null(distr) & method == 'amax')
    diagnostic <- TRUE

  ## If necessary evaluate the statistics H and Z
  if (diagnostic){

    tst <- lmomRFA::regtst(dd, diagnostic.nsim)

    stat <- c(tst$H, tst$Z)
    names(stat) <- c('H1','H2','H3','Zglo','Zgev','Zgno','Zpe3','Zgpa')
    discord <- tst$D

    if (is.null(distr))
      distr <- c('glo','gev','gno','pe3')[which.min(abs(stat[4:7]))]

  } else {
    ## No diagnostic performed
    stat <- NULL
    discord <- NULL

  }

  ##-----------------------------------------##
  ## Evaluating P-value of the homogeneity test
  ##-----------------------------------------##

  ## Don't perform p-value simulation if statistics not computed
  if (is.null(stat))
    pvalue <- FALSE

  if (pvalue){

    ## Choose the correlation with respect to "inter"
    if(inter == 'mat'){
      corr0 <- inter.corr
    } else {
      corr0 <- inter.avg
    }

    ## Compute the null distribution of H and Z
    h0 <- lmomRFA::regsimh(lmom::quakap, rlmom,
                   cor = corr0,
                   nrec = nrec,
                   nrep = pvalue.nrep, nsim = pvalue.nsim)$results

    pval <- rep(0,length(stat))

    ## Compute p-value of homogeneity test (unilateral)
    for(kk in 1:3)
      pval[kk] <- mean(stat[kk] < h0[kk,])

    ## Compute p-value of Z-score (bilateral)
    for(kk in 4:8)
      pval[kk] <- mean(abs(stat[kk]) < abs(h0[kk,]))

    names(pval) <- names(stat)

  } else pval <- NULL

  ## Estimate model parameters
  if (method == 'amax'){
    v <- lmomco::vec2lmom(rlmom, lscale = FALSE)
    para <- lmomco::lmom2par(v, distr)$para

  } else if (method == 'pot'){
    k <- 1/rlmom[2]-2
    para <- c(0, 1 + k, k)
    names(para) <- c('xi','alpha','kappa')

  }

  ans <- list(id = nid,
              nrec = nrec,
              lmom = lmom,
              rlmom = rlmom,
              ntot = ntot,
              distr = distr,
              para = para,
              method = method,
              stat = stat,
              pvalue = pval,
              discord = discord,
              distance = distance,
              inter = inter,
              inter.corr = inter.corr,
              inter.avg = inter.avg)

  class(ans) <- 'poolgrp'

  ans
}

#' @export
print.poolgrp <- function(obj)
{
  cat('\nRegional frequency analysis with pooling groups\n')
  cat('\nMethod:', obj$method)
  cat('\nNb. station:', nrow(obj$lmom))
  cat('\nStation-year:', obj$ntot)

  if (obj$inter == 'mat'){
    cat('\nIntersite:', dim(obj$inter.corr) ,'(matrix)\n')
  } else {
    cat('\nIntersite: ', round(obj$inter.avg,2),' (', obj$inter,')', sep= '')
  }

  if (any(!is.null(obj$stat))){
    cat('\nHomogeneity:', round(obj$stat[1:3],2))

    if (obj$method == 'amax'){
      cat('\n\nZ-scores (absolute):\n')
      print(round(abs(obj$stat[4:8]),2))

    } else cat('\n')
  }

  cat('\nRegional L-moments:\n')
  print(obj$rlmom, digits = 4)

  cat('\nDistribution:', obj$distr)

  cat('\nParameter:\n')
  print(obj$para, digits = 4)

  if (!is.null(obj$pvalue)){
    cat('\np-values:\n')
    print(obj$pvalue)
  }

}


#' @export
plot.poolgrp <- function(obj){
  lmm <- obj$lmom[,3:4]
  colnames(lmm) <- c('t_3','t_4')
  lmom::lmrd(lmm)
  points(obj$rlmom[3],obj$rlmom[4], pch = 16, col = 'red', cex = 1.5)
}

############################################################################
#' Remove heterogenous sites from pooling groups
#'
#' Return a pooling groups object where one site is removed.
#' The selected site is the one that best improve the
#' homogeneity score.
#' The function `Poolremove.auto` proceed step-by-step and remove sites until
#' the heterogeous criteria or a stopping rule is met.
#'
#' @param obj A pooling group object.
#'
#' @param method Which homogeneity statistics used in the procedure. Choices :
#' `H1`, `H2`, `H3`.
#'
#' @param nsim Number of simulation used to evaluate the homogenous criteria
#'
#' @param distr.fix Logical, should the selection of the distribution be
#' reevaluated after removing the site.
#'
#' @export
#'
#' @examples
#'
#' isite <- Intersite(ams ~ id + year, flowAtlantic$ams,
#'                    distance = flowAtlantic$distance,
#'                    smooth = TRUE)
#'
#' fit <- PoolGroup(isite, distance = 2, nk = 30)
#'
#' ## remove the site no. 1, see fit$id for a list of the available site
#' PoolRemove(fit, id = 1)$stat[1:3]
#'
#' PoolRemove.auto(fit, tol = 1.5)$stat[1:3]
#'
############################################################################
PoolRemove <- function(
  obj,
  id = NULL,
  method = 'H1',
  nsim = 1000,
  distr.fix = TRUE)
{
  ## Number of site
  nk <- nrow(obj$lmom)

  ## Which homogeneity statistic to use
  r <- which(method == c('H1','H2','H3'))

  ## Create a regdata object
  dd <- lmomRFA::as.regdata(cbind(obj$id, obj$nrec, obj$lmom), FALSE)

  ## Compute the homogeneity if not already done in obj
  if (is.null(obj$stat)){
     hmg <- lmomRFA::regtst(dd)$H
  } else {
      hmg <- obj$stat[1:3]
  }

  ## Function for computing the homogeneity H without a specific station
  FunHmg <- function(ii) lmomRFA::regtst(dd[-ii,],nsim)

  if(!is.null(id)){

    mid <- which(id == obj$id)

    ## verify that the site to remove is in the pooling group
    if(is.null(mid))
      stop("Must selected a site in the pooling group")

    tst0 <- FunHmg(mid)

  } else {

    ## find the stations that improve best H
    ## Note that the target is always the first station and must be kept
    tst.new <- lapply(1:nk, FunHmg)
    hgm.all <- sapply(tst.new, getElement, 'H')
    mid <- which.min(abs(hgm.all[r,-1])) + 1
    tst0 <- tst.new[[mid]]
  }
  ## update the pooling group object
  obj$id <- obj$id[-mid]
  obj$lmom <- obj$lmom[-mid,]
  obj$nrec <- obj$nrec[-mid]
  obj$ntot <- sum(obj$nrec)
  obj$distance <- obj$distance[-mid]

  obj$stat[1:8] <- c(tst0$H, tst0$Z)

  obj$discord <- tst0$D

  ## update the correlation matrix
  if (length(obj$inter.corr)>1){
    inter.corr <- obj$inter.corr[-mid,-mid]
    obj$inter.corr <- as.matrix(Matrix::nearPD(inter.corr)$mat)

    inter.avg <- inter.corr[lower.tri(inter.corr)]
    obj$inter.avg <- max(mean(inter.avg),0)

  } else {
    inter.avg <- inter.corr
  }

  ## For amax, update the selected distribution
  if (!distr.fix & obj$method == 'amax')
    obj$distr <- c('glo','gev','gno','pe3')[which.min(abs(obj$stat[4:7]))]

  ## Estimate the new regional L-moments
  obj$rlmom <- lmomRFA::regavlmom(dd[-mid,])

  ## Refit the growth curve with maybe new distribution
  UpdateDistr(obj, obj$distr)
}

#' @export
PoolRemove.auto <-
  function(obj, method = 'H1', tol = 2, nmin = 15, ntot.min = 0, nsim = 1000,
          verbose = TRUE, distr.fix = FALSE){

  ## Select the criteria for evaluating the homogeneity
  r <- which(c('H1','H2','H3') == method)

  if (obj$stat[r] <= tol){
    warning('The pooling group is already sufficiently homogenous')
    return(obj)
  }

  repeat{

    ## monitoring (if require)
    if (verbose){
      print(paste('n :', nrow(obj$lmom),
                  ', h:', round(obj$stat[r],2),
                  ', t:', obj$ntot, sep = '') )
    }

    ## remove one
    suppressWarnings(obj.new <- PoolRemove(obj, method = method, nsim = nsim))

    ## Verify that the new pooling group has enough station-years
    if ((obj.new$ntot < ntot.min) | (length(obj.new$id) < nmin)){
      warning('The minimal number of stations or station-years have been reached')

      ans <- obj
      break
    }

    ## If a sufficient heterogeous criterion has been reached
    if (obj.new$stat[r] <= tol){
      ans <- obj.new
      break
    }

    ## iterate
    obj <- obj.new

  }

  ## Find the new best distribution if necessary
  if (!distr.fix & ans$method == 'amax')
    distr <- c('glo','gev','gno','pe3')[which.min(abs(ans$stat[4:7]))]
  else
    distr <- ans$distr

  ## Update parameters
  return(UpdateDistr(ans, distr))

}

UpdateDistr <- function(obj, distr)
{
  obj$distr <- distr

  ## Estimate parameters
  if (obj$method == 'amax'){
    v <- lmomco::vec2lmom(obj$rlmom, lscale = FALSE)
    obj$para <- lmomco::lmom2par(v, obj$distr)$para

  } else if (obj$method == 'pot'){
    k <- 1/obj$rlmom[2]-2
    obj$para <- c(1 + k, k)

  }

  return(obj)
}

############################################################################
#' Flood quantiles estimated at the target of a pooling group
#'
#' Predict the flood quantile of a given return period for its target sites.
#'
#' @param obj Pooling group object.
#'
#' @param q Probability associated to the flood quantiles.
#'
#' @param indx Index flood factor. By default the sample average of the target.

#' @param ci Logical. Should the confident intervals and the standard deviation
#' be evaluated?
#'
#' @param nsim Number of simulation used for approximating the confident intervals.

#' @param alpha Significance level.
#'
#' @section References:
#'
#' Hosking, J. R. M., & Wallis, J. R. (1997). Regional frequency analysis:
#'   an approach based on L-moments. Cambridge Univ Pr.
#'
#' @import lmomRFA
#' @import lmom
#'
#' @export
#'
#' @examples
#' isite <- Intersite(ams ~ id + year, flowAtlantic$ams,
#'                    distance = flowAtlantic$distance,
#'                    smooth = TRUE)
#'
#' fit <- PoolGroup(isite, distance = 2, nk = 30)
#'
#' predict(fit, c(.3,.7))
#' predict(fit, ci = TRUE)
#'
predict.poolgrp <-function(
  obj,
  q = c(.5, .8, .9, .95, .98 , .99),
  indx = NULL,
  ci = FALSE,
  nsim = 5000,
  alpha = 0.05)
{
  ## Probability in term of return period.
  rt <- 1/(1-q)

  ## Default index-flood is the mean of the site
  if (is.null(indx))
    indx <- obj$lmom[,1]

  ## Extract quantile function
  qfunc <- getFromNamespace(paste0('qua',obj$distr), 'lmom')

  if (ci){

    ## extract correlation matrix or value
    if (obj$inter == 'mat'){
      corr0 <- obj$inter.corr
    } else {
      corr0 <- obj$inter.avg
    }

    ## compute the parameter at each site
    ffunc <- getFromNamespace(paste0('pel',obj$distr), 'lmom')
    lmom <- obj$lmom
    lmom[,2] <- lmom[,2]*lmom[,1]
    para <- apply(lmom,1,ffunc)

    ## Alternative code: more robust version
    #ffunc <- function(v){
    #  z <- lmomco::vec2lmom(v, lscale = FALSE)
    #  lmomco::lmom2par(z,obj$distr)$para
    #}

    #para <- apply(obj$lmom,1,ffunc)

    ## Create a regfit object
    dd <- as.regdata(cbind(obj$id,obj$nrec,obj$lmom),FALSE)
    rfit <- regfit(dd,obj$distr)

    ## simulate flood quantile
    simq <- regsimq(qfunc,
                   para = t(para),
                   cor = corr0,
                   index = indx,
                   nrec = obj$nrec,
                   nrep = nsim,
                   fit = obj$distr,
                   f = q,
                   boundprob = c(alpha/2, 1-alpha/2))

    ## extract confident interval and standard deviation
    ans <- sitequantbounds(simq, rfit, site=1)
    ans <- as.matrix(ans)
    colnames(ans) <- c('rt', 'pred', 'se', 'lower','upper')
    ans[,1] <- rt

  } else {
    ## Compute the flood quantiles
    ans <- indx[1] * qfunc(q, obj$para)
    names(ans) <- paste0('rt',round(rt,1))
  }

  return(ans)
}