floodStat: FloodNet Regional Frequency Analysis

#############################################################################
#' Simulation of a regional dataset with intersite correlation.
#'
#' Returns a dataset containing multiple time series in the form of
#' a matrix where the sites are in columns.
#' Different record lengths can be specified for each site
#' and missing values are filled accordingly at the beginning.
#' The rows (time) are independent and the intersite correlation model
#' is based on a multivariate Normal distribution.
#'
#' @param x,obj Matrix (in rows) of parameters or L-moments for all sites to simulate.
#' Can also be a pooling group or intersite model, where key arguments are extracted
#' from the input object.
#'
#' @param distr Marginal distribution of each site.
#' If only one value is passed as argument, the same is used for all sites.
#'
#' @param nrec Record lengths of the sites.
#' If only one value is passed in argument, the same is used for all sites.
#'
#' @param corr Correlation matrix for the dependence between site.
#' If only one value is passed, the correlation is assumed
#' the for every pair of sites.
#'
#' @param corr.sqrt Squared correlation matrix. Can be passed to speed up
#'    multiple calls.
#'
#' @param lmom Logical. Is the argument `x` a matrix of L-moments or
#' distribution parameters
#'
#' @param lscale Logical. Is the second L-moments the scale (`TRUE`)
#' or the LCV (`FALSE`).
#'
#' @param long  Logical. Should the output be returned in a long format.
#'
#' @export
#'
#' @examples
#'
#' ## Extract data
#' data(canFlood)
#' lmom <- canFlood[1:5, c('L1','LCV','LSK')]
#'
#' ## Simulate data base on at-site L-moments
#' sim <- RegSim(lmom, distr = 'gev', nrec = 11:15, corr = .4)
#'
#' head(sim)
#'
#' sim <- RegSim(lmom,
#'               distr = c(rep('gev',4),'gno'),
#'               nrec = 10:15,
#'               corr = .4, long = TRUE)
#' head(sim)
#'
#' ############################################################################
RegSim <- function(x, ...) UseMethod('RegSim', x)

#' @export
RegSim.data.frame <- function(x, ...)
  RegSim.matrix(as.matrix(x), ...)

#' @export
#' @rdname RegSim
RegSim.matrix <-
  function(
    x,
    distr,
    nrec,
    corr = 0,
    corr.sqrt = FALSE,
    lmom = TRUE,
    lscale = FALSE,
    long = FALSE)
{

  ## Extract info
  nsite <- nrow(x)
  nrec.max <- max(nrec)

  ## Expend distr if constant
  if(length(distr) == 1)
    distr <- rep(distr, nsite)

  ## Create a matrix if only a scalar is passed as correlation
  if (length(corr) == 1){
    corr <- matrix(corr, nsite, nsite)
    diag(corr) <- 1
  }

    ## Create a matrix if only a scalar is passed as correlation
  if (length(nrec) == 1){
    nrec <- rep(nrec,nsite)
  }

  ## Decomposed the correlation matrix if necessary
  if( any(is.na(corr)))
    stop('Missing value in the correlation matrix')

  if (corr.sqrt){
    corr.L <- corr
  } else {
    corr.L <- chol(corr)
  }

  ## Simulate a normal copula
  u <- matrix(rnorm(nrec.max * nsite), nrec.max, nsite)
  u <- pnorm(u %*% corr.L)

  ## ------------------------------------------ ##
  ## Convert L-moment to parameter if necessary
  ## ------------------------------------------ ##

  if (lmom){
    ## Extract list of functions that convert L-moments to parameter
    ffunc <- lapply(distr,
                  function(d) getFromNamespace(paste0('pel',d),'lmom'))

    ## If LCV is passed correct the 2nd L-moments
    if(lscale == FALSE)
      x[,2] <- x[,2] * x[,1]

    ## Estimate parameters for every site
    para <- lapply(1:nrow(x), function(kk) ffunc[[kk]](x[kk, ]))

  } else {

    ## If parameter were passed as matrix, transform to list
    para <- lapply(split(x,1:nrow(x)), na.omit)
  }

  ## Extract a list of quantile functions for each site
  qfunc <- lapply(distr, function(d){
             getFromNamespace(paste0('qua',d),'lmom')
           })

  ## Transform uniform data to proper marginal distribution
  ans <- sapply(1:nsite, function(kk){
            qfunc[[kk]](u[,kk], para[[kk]])
          })

  ## Adjust record length by adding NA to early data
  Rlen <- function(x,n){
    nrm <- length(x)-n

    if(nrm > 0)
      x[1:nrm] <- NA

    return(x)
  }

  ans <- sapply(1:nsite, function(kk) Rlen(ans[,kk], nrec[kk]))

  ## ------------------------------------- ##
  ## Transform the data in the long format
  ## ------------------------------------- ##

  if(long){

   ans <- cbind(expand.grid(time = 1:nrow(ans), site = 1:ncol(ans)),
                   value = as.numeric(ans))

   ans <- ans[!is.na(ans$value), ]

  }

  return(ans)
}

#' @export
#' @rdname RegSim
RegSim.intersite <- function(obj, distr, n = 1)
{
  ## Extract dimension
  nsite <- nrow(obj$lmom)
  nmax <- max(obj$nrec)

  ## Pre decompose the correlation matrix for faster simulation
  corr.L <- chol(obj$corr)

  ## -------------------- ##
  ## Perform simulation
  ## -------------------- ##

  if(n == 1){
    ## Return a matrix if only one dataset is simulated
    ans<- RegSim(obj$lmom,
                distr = distr,
                nrec = obj$nrec,
                corr = corr.L,
                corr.sqrt = TRUE)

  } else {

    ans <- replicate(n, RegSim(obj$lmom,
                               distr = distr,
                               nrec = obj$nrec,
                               corr = corr.L,
                               corr.sqrt = TRUE))
  }

  return(ans)
}

#' @export
#' @rdname RegSim
RegSim.poolgrp <- function(obj, n = 1, long = FALSE, ...)
{

  nsite <- nrow(obj$lmom)
  nmax <- max(obj$nrec)

  ## ---------------------------------------------------------- ##
  ## Pre decompose the correlation matrix for faster simulation
  ## ---------------------------------------------------------- ##

  if (obj$inter == 'avg'){
    ## Create a matrix with constant correlation coefficients
    cor0 <- matrix(obj$inter.avg, nsite, nsite)
    diag(cor0) <- 1
    cor0 <-as.matrix(Matrix::nearPD(cor0, corr = TRUE)$mat)
  } else{
    cor0 <- obj$inter.corr
  }

  corr.L <- chol(cor0)

  ## ---------------------------------------------------------- ##
  ## Perform simulation
  ## ---------------------------------------------------------- ##

  lmom0 <- t(replicate(nsite,obj$rlmom))

  Sim1 <- function(){
    out <- RegSim(lmom0, distr = obj$distr, nrec = obj$nrec,
                  corr = corr.L, corr.sqrt = TRUE, long = FALSE)
      mapply('*', as.data.frame(out), obj$lmom[,1])
  }

  ans <- replicate(n, Sim1())
  colnames(ans) <- obj$id

  ## Case there is one simulation
  if(n == 1)
    ans <- ans[ , , 1]

  if (n == 1 & long){
    ids <- expand.grid(1:nrow(ans),
                       1:ncol(ans))[,2:1]
    ans <- cbind(ids, as.numeric(ans))
    colnames(ans) <- c('site','time','value')

  }

  ## Case
  if (n >1 & long) {
    ids <- expand.grid(1:nrow(ans),
                       1:ncol(ans),
                       1:n)[,3:1]

    ans <- cbind(ids, as.numeric(ans))
    ans <- na.omit(ans)
    colnames(ans) <- c('rep','site','time','value')
  }

  return(ans)
}


############################################################################
#' Modeling intersite correlation
#'
#' Returns an intersite correlation model.
#' The function computes the sample L-moments and estimates the empirical
#' intersite correlation matrix between sites.
#' A power exponential model can be fitted to smooth the intersite correlations
#' in respect with the distance.
#'
#' @param form Formula that describes the response value, sites and time variables.
#' Must have the form \code{value ~ site + time}.
#'
#' @param x Data. If not specify by a formula, the columns must be respectively :
#' value, site and time.
#'
#' @param distance Matrix of distances. Must be of the same order as the sites
#' appear in x.
#'
#' @param nmin Minimal number of paired observations for computing
#' the empirical correlation, otherwise no correlation is assumed.
#'
#' @param smooth Logical. Should the intersite correlation matrix be smoothed by
#' a power exponential model.
#'
#' @param smooth.p Exponent of the power exponential model.
#'
#' @param smooth.hmax Maximal distance used for fitting the power exponential model.
#'
#' @param nmom Number of L-moments to evaluate.
#'
#' @return
#'
#' \item{lmom}{Matrix of L-moments.}
#' \item{nrec}{Record lengths.}
#' \item{distance}{Distance matrix.}
#' \item{corr}{Intersite correlation matrix.}
#' \item{para}{Parameters of the power exponential model.}
#'
#' @export
#'
#' @examples
#'
#' h <- flowAtlantic$distance
#' a <- flowAtlantic$ams
#'
#' Intersite(ams ~ id + year, a)
#'
#' Intersite(ams ~ id + year, a, distance = h,
#'           smooth = TRUE, smooth.hmax = 500)
#'
############################################################################
Intersite <- function(x, ...) UseMethod('Intersite',x)

#' @export
#' @rdname Intersite
Intersite.data.frame <-function(
  x,
  distance = NULL,
  nmin = 20,
  smooth = FALSE,
  smooth.p = 1,
  smooth.hmax = Inf,
  nmom = 4)
{

  ## Convert data to wide format
  xmat <- DataWide(x)

  ##---------------------------------##
  ## Compute the correlation matrix
  ##---------------------------------##

  ## compute correlation matrix using spearman rho
  corr <- cor(xmat, use = 'pairwise.complete.obs', method = 'spearman')
  corr <- 2*sin(pi/6*corr)

  ## compute the length of each pair and remove the correlation coefficient
  ## that were evaluated with too few pairs
  nmat <- crossprod(!is.na(xmat))

  nid <- (nmat < nmin)
  if (any(nid))
    corr[nid] <- NA

  #impute zero to missing values
  nid <- is.na(corr)
  if (any(nid))
    corr[nid] <- 0

  ## force the diagonal to 1. Error could happen if nmin > nsite
  diag(corr) <- 1

  ##-----------------------------------------------------------##
  ## Fit a power exponential model on the pairwise correlation
  ##-----------------------------------------------------------##
  if (smooth){

    if (is.null(distance))
      stop('Distance must be specified for smoothing correlation')

    ## extract pairs
    pcor <- corr[lower.tri(corr)]
    ph <- distance[lower.tri(distance)]
    w <- nmat[lower.tri(nmat)]

    ## remove pairs too far apart
    pid <- (ph <= smooth.hmax)
    pcor <- pcor[pid]
    ph <- ph[pid]
    w <- w[pid]

    ## Define power exponential model
    Fpow <- function(p,h){
      ## ensure positiveness
      p[1] <- exp(p[1])/(1+exp(p[1]))
      p[2] <- exp(p[2])

      return(p[1] * exp(-3*( h / p[2])^smooth.p))
    }

    ## Define MSE of the correlation fit

    Fobj <- function(p,h) sum(w*(pcor - Fpow(p,h))^2)

    ## Optimize the MSE
    sol <- optim(c(0,log(mean(ph))),
                 Fobj, h = ph)

    ## Create the new smooth correlation matrix base on fitted model
    corr <- Fpow(sol$par,distance)
    diag(corr) <- 1

    ## Extract the parameters of the correlation model
    para <- c(1-exp(sol$par[1])/(1+exp(sol$par[1])),
              exp(sol$par[2]),
              smooth.p)

    names(para) <- c('nugget','range','smooth')


  } else {
    para <- NULL
  }

  ## Empirical correlation matrix
  ## Correct for positive definitness
  corr <- as.matrix(Matrix::nearPD(corr, corr = TRUE)$mat)

  ## Compute the record lengths
  nrec <- apply(xmat,2,function(z) length(na.omit(z)))

  ## There should be at least 4 L-moments
  nmom <- max(2,nmom)

  ## Compute the Lmoments
  lmom <- t(apply(xmat, 2, function(z) lmom::samlmu(na.omit(z),nmom)))

  colnames(lmom) <- c('L1','LCV','LSK','LKUR','TAU5','TAU6')[1:nmom]
  lmom[,2] <- lmom[,2]/lmom[,1]

  ## Return results
  ans <- list(lmom = lmom,
              nrec = nrec,
              distance = distance,
              corr = corr,
              para = para)


  class(ans) <- 'intersite'

  return(ans)
}


#' @export
#' @rdname Intersite
Intersite.formula <- function(form, x, ...){
  x <- model.frame(form, as.data.frame(x))
  Intersite(x, ...)
}

#' @export
Intersite.matrix <- function(x, ...){
  Intersite(as.data.frame(x), ...)
}

#' @export
print.intersite <- function(obj)
{
  ## number of site
  n <- nrow(obj$corr)
  pcor <- obj$corr[lower.tri(obj$corr)]

  cat('\nIntersite correlation\n')
  cat('\nNumber of site: ',n,'\n')

  ## Correlation model
  if (all(obj$para == -1)){
    cat('Model: emp\n')
    cat('Average:', round(mean(pcor),3))
    mx <- pcor[which.max(abs(pcor))]
    cat('\nMax (abs):', round(mx,3))

  } else{
    cat('Model: exp\nParameters:\n')
    print(round(obj$para,3))
  }

}


####################
#' Transform streamflow data to wide format
#'
#'  Return a matrix where every columns correspond to a specific site.
#'  Missing value are added.
#'
#' @param x Data in long format.
#'
#' @param form Formula that specify the variable.
#' Must have the form: `value ~ site + time`.
#'
#'
#' @export
#'
#' @examples
#'
#' DataWide(flowAtlantic$ams, ams ~ id + year)
#'
DataWide <- function(x, form = NULL){

  ## Force data to be a data.frame
  x <- as.data.frame(x)

  ## If a formula is passed, extract the variable
  if(!is.null(form))
    x <- model.frame(form,x)

  ## Verify that there is no missing value in the data
  x <- na.omit(x)

  ## verify that there is no duplicate
  if(max(table(x[,2],x[,3]))>1)
    stop('There is some duplicates')

  ##---------------------------------##
  ## reshape data in matrix wide format
  ##---------------------------------##

  ## Extract level for the site and time id
  site.name <- unique(as.character(x[,2]))
  time.name <- unique(as.character(x[,3]))
  nsite <- length(site.name)

  if(nsite < 2)
    stop('Must have more than one site')

  ## Transform site and time id in integer
  ## note that factor sort the label
  x[,2] <- as.integer(factor(as.character(x[,2])))
  x[,3] <- as.integer(factor(as.character(x[,3])))

  ## split the data in list
  xlst <- split(x, x[,2])
  nc <- length(xlst)
  nr <- max(x[,3])

  ## Sort site in the original order
  site.order <- unique(x[,2])
  xlst <- xlst[site.order]

  ## allocate memory
  xmat <- matrix(NA, nr, nc)

  ## Affect the value to the wide format
  for(jj in 1:nc)
    xmat[xlst[[jj]][,3],jj] <- xlst[[jj]][,1]

  colnames(xmat) <- site.name
  rownames(xmat) <- sort(time.name)

  return(xmat)
}