R/roiGls.R

#######################################################################
#' Perform frequency analysis of a target site
#' using Region of influence (ROI) and Generalized least squares (GLS)
#'
#' Returns the fitting of the regression models at a target
#' site with the best calibration settings.
#'
#' @param form Formula or list of formulas to try.
#'
#' @param x Data frame containing all variables in the formulas.
#'
#' @param x0 Data.frame or list containing the input variables of
#'   the target
#'
#' @param nk Scalar or vector providing a series of neighborhood sizes
#'   to try.
#'
#' @param S Sampling covariance matrix for the output variable.
#'
#' @param distance Vector of distance between the donors and target
#'
#' @param criteria Criteria to select the best calibration. One of
#'   'gcv','vp0','avp', 'avpo'.
#'
#'
#' @param lambda Scalar that modified the penalty in the GCV criteria
#'
#' @param sigTol Lower limit for the model variance during the clalibration
#'
#' @param verbose If true a progress bar is printed.
#'
#' @details
#'
#' The GLS model was  proposed by Stedinger and Takser (1985):
#' \deqn{y= X \beta + \eta + \epsilon}
#' where \eqn{\eta} and \eqn{\epsilon} are two independent terms of errors.
#' The total error is \eqn{\epsilon = \eta + \delta} and includes
#' respectively the error due to sampling, with known covariance
#' matrix \eqn{S}, and a iid term of error \eqn{\eta} with variance
#' \eqn{\sigma^2}.
#' Therefore, the covariance matrix of the total error is then given by
#' \deqn{ \Lambda = \sigma^2 I + S = \sigma^2 G.}
#'
#' For a given model error \eqn{\sigma^2}, the cholesky decomposition provide
#' the matrix decomposition \eqn{G^{-1} = UU'}.
#' Solving the initial GLS model with known model variance is then equivant
#' to solving the ordinary least-squares problem
#' \deqn{U'y = UX + \epsilon^\ast.}
#' A new estimation of the model error can be obtained by
#' \deqn{\hat\sigma^2 = (n-p)^{-1} \sum_{i =1 }^n \epsilon_i^\ast}
#' where \eqn{n} is the number of sites and \eqn{p} the number of
#' parameters.
#' Subsequent iterations are then performed to improved the global
#' fitting of the GLS model until \eqn{\sigma^2}
#' has converged.
#'
#'
#' @section References:
#'
#' Stedinger, J. R., & Tasker, G. D. (1985). Regional Hydrologic Analysis:
#'   1. Ordinary, Weighted, and Generalized Least Squares Compared.
#'   Water Resources Research, 21(9), 1421–1432.
#'   https://doi.org/10.1029/WR021i009p01421
#'
#' Reis, D. S., Stedinger, J. R., & Martins, E. S. (2005). Bayesian
#'   generalized least squares regression with application to log
#'   Pearson type 3 regional skew estimation. Water Resources Research,
#'   41(10), W10419. https://doi.org/10.1029/2004WR003445
#'
#' Kjeldsen, T. R., & Jones, D. A. (2009). An exploratory analysis of
#'   error components in hydrological regression modeling. Water Resources
#'   Research, 45(2), W02407. https://doi.org/10.1029/2007WR006283

#'
#' @export
#'
#' @examples
#' xd <- ungaugedDemoData()
#'
#' lform <- list(y ~ x,
#'               y ~ x + I(x^2))
#'
#' fit <- roiGls( lform, xd$data[-1,], xd$data[1,],
#'               S = xd$S[-1,-1],
#'               distance = xd$distance[1,-1],
#'               nk = seq(20,200,10))
#'
#' # Show best calibration settings
#' print(fit)
#'
#' # Summary of the model
#' summary(fit)
#' plot(fit)

roiGls <- function(form, ...) UseMethod('roiGls', form)

#' @export
#' @rdname roiGls
roiGls.formula <- function(form, x, x0, nk, S = NULL, distance = NULL,
                        criteria = 'vp0', lambda = 1, sigTol = 0,
                        verbose = FALSE){

  ## Extract data from formula
  x <- model.frame(form,x)
  xm <- model.matrix(form,x)

  tt <- delete.response(terms(form))
  xm0 <- as.numeric(model.matrix(tt, x0))
  x0 <- model.frame(tt, x0)
  names(x0) <- names(x)[-1]

  nk <- sort(nk)
  if(max(nk)> nrow(x))
    stop('Some sizes are larger than the population')

  ## Default covariance is identity
  if(is.null(S)) S <- diag(nrow(x))
  else if( nrow(S) != nrow(x)) stop('Wrong size for S')

  ## Compute distance between output variables (if necessary)
  if(is.null(distance)){
    distance <- apply(x[,-1], 1, function(x1,x2){
      sqrt(sum((x1-x2)^2))}, x2 = as.numeric(x0))

  } else if(length(distance) != nrow(x)){
    stop('Wrong size for distance')
  }

  nid <- order(distance)

  ## Create storage
  y0 <- vp0 <- avpo <- avp <-
        gcv <- sig2 <- matrix(NA,1,length(nk))

  ## Set monitoring bar
  if(verbose) bar <- txtProgressBar(style = 3)

  ## Loop among ROI sizes
  for(ii in 1:length(nk)){

    sid <- nid[1:nk[ii]]

    ## Do not use a neigborhood with Infinite or NA distance
    if(sum(!is.finite(distance[sid])) > 0) next

    ## Fit the GLS model
    mdl <- lmFlood(form, x[sid,], S = S[sid,sid])

    ## Compute the criteria
    y0[ii] <- sum(xm0 * coef(mdl))
    sig2[ii] <- mdl$sig2
    vp0[ii]  <- mdl$sig2 +
                  crossprod(xm0, vcov(mdl,'beta') %*% xm0)

    A <- tcrossprod(xm[sid,] %*% vcov(mdl,'beta'), xm[sid,] )
    avp[ii] <- mdl$sig2 + mean(diag(A))
    avpo[ii] <- avp[ii] - 2 * mdl$sig2 * mdl$rank/nk[ii]

    gcv[ii] <- (nk[ii]/(nk[ii]- lambda * mdl$rank)^2 *
                                       sum(residuals(mdl,'b')^2))

    ## Select a criteria
    if(criteria == 'gcv')
      crit = gcv[ii]
    else if(criteria == 'avp')
      crit = avp[ii]
    else if(criteria == 'avpo')
      crit = avpo[ii]
    else if(criteria == 'vp0')
      crit = vp0[ii]


    ## Keep the best model information
    if(ii == 1){
      bestMdl <- mdl
      bestNeighbor <- sid
      bestii <- ii
      bestCrit <- crit
    } else if(bestMdl$sig2 < sigTol & mdl$sig2 > sigTol){
      bestMdl <- mdl
      bestNeighbor <- sid
      bestii <- ii
      bestCrit <- crit
    } else if(crit < bestCrit & mdl$sig2 > sigTol){
      bestMdl <- mdl
      bestNeighbor <- sid
      bestii <- ii
      bestCrit <- crit
    }

    ## Monitoring
    if(verbose) setTxtProgressBar(bar,ii/length(nk))

  }

  ans <- list(data = x, distance = distance, S = S,
              mdl = bestMdl, ii = bestii, lambda = lambda,
              critValue = bestCrit, criteria = criteria,
              donor = bestNeighbor,
              iiform = 1, form = list(form), sigTol = sigTol,
              y0 = y0, x0 = x0, nk = nk, sig2 = sig2,
              gcv = gcv, vp0 = vp0, avp = avp, avpo = avpo)

  class(ans) <- 'roiGls'
  return(ans)

}

#' @export
#' @rdname roiGls
roiGls.list <- function(form, x, x0, ...){

  ## Create storage
  y0 <- sig2 <- gcv <- avp <- vp0 <- avpo <- list()

  ## Loop through formulas
  for(ii in 1:length(form)){
      ## Fit a ROI-GLS
      tmp <- roiGls(form[[ii]], x, x0, ...)

      ## Extract criteria
      y0[[ii]] <- tmp$y0
      sig2[[ii]] <- tmp$sig2
      gcv[[ii]] <- tmp$gcv
      avp[[ii]] <- tmp$avp
      avpo[[ii]] <- tmp$avpo
      vp0[[ii]] <- tmp$vp0

      ##update the best model
      if(ii == 1){
        ans <- tmp
        iiform <- ii
      }
      else if(tmp$critValue < ans$critValue) {
        ans <- tmp
        iiform <- ii
      }
  }

  ## Keep info on best model
  ans$form <- form
  ans$iiform <- iiform
  ans$data <- x
  ans$x0 <- x0

  ## Reshape the list of criteria in matrix
  nsize <- length(sig2[[1]])

  if(nsize > 1){
    ans$y0   <- matrix(unlist(y0),   ncol = nsize, byrow = TRUE)
    ans$sig2 <- matrix(unlist(sig2), ncol = nsize, byrow = TRUE)
    ans$gcv  <- matrix(unlist(gcv),  ncol = nsize, byrow = TRUE)
    ans$avp  <- matrix(unlist(avp),  ncol = nsize, byrow = TRUE)
    ans$vp0  <- matrix(unlist(vp0),  ncol = nsize, byrow = TRUE)
    ans$avpo <- matrix(unlist(avpo), ncol = nsize, byrow = TRUE)
  }

  return(ans)
}

#################################################################
#' Update a ROI-GLS model
#'
#' Refit the ROI-GLS model with best calibration after changes.
#' Try to avoid to unnesssary fitting to identify the new best calibration
#' settings.
#'
#' @param obj A ROI-GLS model. Normally an output of function \code{roiGls}
#'
#' @param selectNk Neighborhood size to use. Must be in the list
#'  of candidate sizes.
#'
#' @param selectForm Formula to use. Must be in the list of candidate formulas.

#' @param newCriteria Criteria to select the best calibration settings.
#'
#' @param newLambda New parameter \code{lambda} used for the GCV criteria.
#'
#' @param newSigTol New parameter \code{tol} for the model variance.
#'
#' @param newNk Vector of new neighborhood sizes to try.
#'
#' #' @param newForm List of New formulas to try.
#'
#' @param rmNk A ROI size to remove from the candidate sizes
#'
#' @param rmForm A formula to remove from the candidate formula
#'
#' @export
roiUpdate <- function(obj,...) UseMethod('roiUpdate',obj)

#' @export
#' @rdname roiUpdate
roiUpdate.list <- function(obj, ...){

  bar <- txtProgressBar(style = 3)
  setTxtProgressBar(bar,0)

  for(kk in 1:length(obj)){
    obj[[kk]] <- roiUpdate(obj[[kk]], ...)
    setTxtProgressBar(bar,kk/length(obj))
  }

  return(obj)
}


#' @export
roiUpdate.roiGls <- function(obj, selectNk = NULL, selectForm = NULL,
                      newCriteria = NULL, newLambda = NULL, newSigTol = NULL,
                      newNk = NULL, newForm = NULL,
                      rmNk = NULL, rmForm = NULL)
{

  ## Update GCV with new lambda parameter
  if(!is.null(newLambda)){
    np <- sapply(obj$form, function(z) ncol(model.matrix(z,obj$data)))

    for(jj in 1:nrow(obj$gcv)){
      for(kk in 1:ncol(obj$gcv)){
        obj$gcv[jj,kk] <- (obj$gcv[jj,kk] *
                            (obj$nk[kk] - obj$lambda * np[jj])^2 /
                            (obj$nk[kk] - newLambda * np[jj])^2)

      }
    }

    obj$lambda <- newLambda

  }#endif

  ## remove sizes form list
  if(!is.null(rmNk)){
    rmNkId <- which(rmNk == obj$nk)

    obj$nk   <- obj$nk[-rmNkId]
    obj$y0   <- obj$y0[,-rmNkId]
    obj$sig2 <- obj$sig2[,-rmNkId]
    obj$gcv  <- obj$gcv[,-rmNkId]
    obj$avp  <- obj$avp[,-rmNkId]
    obj$vp0  <- obj$vp0[,-rmNkId]
    obj$avpo <- obj$avpo[,-rmNkId]

  }

  ## remove formulas from list
  if(!is.null(rmForm)){
    rmFormId <- which(sapply(obj$form, function(f) rmForm == f))

    obj$form <- obj$form[-rmFormId]
    obj$y0   <- obj$y0[-rmFormId,]
    obj$sig2 <- obj$sig2[-rmFormId,]
    obj$gcv  <- obj$gcv[-rmFormId,]
    obj$avp  <- obj$avp[-rmFormId,]
    obj$vp0  <- obj$vp0[-rmFormId,]
    obj$avpo <- obj$avpo[-rmFormId,]

  }

  ## Add new sizes to the list
  if(!is.null(newNk)){

    ## Upgrade the storage for the selection criteria
    obj$nk <- c(newNk,obj$nk)
    newBlock <- matrix(NA,nrow(obj$sig2),length(newNk))
    obj$y0   <- cbind(newBlock,obj$y0)
    obj$sig2 <- cbind(newBlock,obj$sig2)
    obj$gcv  <- cbind(newBlock,obj$gcv)
    obj$avp  <- cbind(newBlock,obj$avp)
    obj$vp0  <- cbind(newBlock,obj$vp0)
    obj$avpo <- cbind(newBlock,obj$avpo)

  }

  ## Add Formulas to the list
  if(!is.null(newForm)){

    if(!is.list(newForm)) newForm <- list(newForm)

    obj$form <- append(newForm,obj$form)
    newBlock <- matrix(NA,length(newForm),ncol(obj$sig2))
    obj$y0   <- rbind(newBlock,obj$y0)
    obj$sig2 <- rbind(newBlock,obj$sig2)
    obj$gcv  <- rbind(newBlock,obj$gcv)
    obj$avp  <- rbind(newBlock,obj$avp)
    obj$vp0  <- rbind(newBlock,obj$vp0)
    obj$avpo <- rbind(newBlock,obj$avpo)
  }

  ## Fit models with new sizes and formulas
  for(jj in 1:nrow(obj$sig2)){
    for(kk in 1:ncol(obj$sig2)){

      if(is.na(obj$sig2[jj,kk])){
        new <- roiGls(obj$form[[jj]], x = obj$data, x0 = obj$x0,
                   nk = obj$nk[kk], sigTol = obj$sigTol,
                   distance = obj$distance, S = obj$S, verbose = FALSE,
                   criteria = obj$criteria, lambda = obj$lambda)

        obj$y0[jj,kk]    <- new$y0[1]
        obj$sig2[jj,kk] <- new$sig2[1]
        obj$gcv[jj,kk]  <- new$gcv[1]
        obj$avp[jj,kk]  <- new$avp[1]
        obj$vp0[jj,kk]  <- new$vp0[1]
        obj$avpo[jj,kk] <- new$avpo[1]

      }# endif
    }
  }#endfor


  ## Extract the selection criteria
  if(!is.null(newCriteria)) obj$criteria <- newCriteria

  if(obj$criteria == 'gcv')
      crit = obj$gcv
  else if(obj$criteria == 'avp')
    crit = obj$avp
  else if(obj$criteria == 'vp0')
    crit = obj$vp0
  else if(obj$criteria == 'avpo')
    crit = obj$avpo

  ## Avoid selecting a model with a near zero variance
  if(!is.null(newSigTol)) obj$sigTol <- newSigTol

  crit[obj$sig2 < obj$sigTol] <- Inf

  ## identify the new calibration setting
  if(is.null(selectNk) & is.null(selectForm)){
    ## optimal
    obj$iiform <- which.min(apply(crit,1,min))
    obj$ii     <- which.min(crit[obj$iiform,])

  } else if(!is.null(selectNk) & !is.null(selectForm)){
    ## Both size and formula known
    obj$iiform <- which(sapply(obj$form, function(f) selectForm == f))
    obj$ii     <- which(selectNk == obj$nk)

  } else if(!is.null(selectNk)){
    ## Known ROI size
    obj$ii     <- which(selectNk == obj$nk)
    obj$iiform <- which.min(crit[,obj$ii])

  } else{
    ## Known formula
    obj$iiform <- which(sapply(obj$form, function(f) selectForm == f))
    obj$ii     <- which.min(crit[obj$iiform,])
  }

  ## refit the GLS model
  new <- roiGls(obj$form[[obj$iiform]], x = obj$data, x0 = obj$x0,
          nk = obj$nk[obj$ii], sigTol = obj$sigTol,
          distance = obj$distance, S = obj$S, verbose = FALSE,
          criteria = obj$criteria, lambda = obj$lambda)

  ## update the best model components
  obj$mdl       <- new$mdl
  obj$critValue <- new$critValue
  obj$donor        <- new$donor


  return(obj)
}


########################################################
#' Plot and summary diagnostic of ROI-GLS calibration
#'
#' The function \code{roiPlot} draw the prediction variance and
#' the GCV of the calibrated ROI-GLS model in respect of the number
#' of sites in the provided list \code{nk}.
#' Also, most of the function use to analyse the output of a \code{lm}
#' model avaible to analyse the GLS model in its OLS form.
#' See \code{\link{roiGLS}}.
#'
#' @param obj A ROI-GLS model. Normally output of function \code{roiGls}
#'
#' @param iiform Indices of the formula to plot. By default the
#'   formula of the best calibration.
#'
#' @param ... Other parameter for the generic plot function
#'
#' @export
#'
#' @examples
#'
#' xd <- ungaugedDemoData()
#'
#' fit <- roiGls(y ~ x,
#'               xd$data[-1,], xd$data[1,],  S = xd$S[-1,-1],
#'               distance = xd$distance[1,-1],
#'               nk = seq(20,100,10))
#'
#' roiPlot(fit, type = 'l')
#'
#' print(fit)
#'
#' summary(fit)
#'
#' coef(fit)
#'
#' residuals(fit)

roiPlot <- function(obj, iiform = NULL, xlab = 'Number of sites',
                    ylab = 'Model error',
                    ylim = NULL, pos = 'topright',
                    type = NULL, ...){

  if(is.null(iiform)) iiform <- obj$iiform

  if(is.matrix(obj$gcv)){
    obj$gcv <- obj$gcv[iiform,]
    obj$vp0 <- obj$vp0[iiform,]
  }

  # find bounds for the graphic
  if(is.null(ylim)){
    yall <- c(obj$gcv, obj$vp0)
    ylim <- c(0.95 * min(yall), 1.05 * max(yall))
  }

  ## Plot the graphic
  plot(obj$nk,obj$vp0, ylim = ylim, xlab = xlab, ylab = ylab ,
       type = type, ...)
  points(obj$nk,obj$gcv, col = 2, type = type)
  abline(v = obj$nk[obj$ii], lty = 2, col = 3)
  legend(pos, col = c(1,2), lty = rep(1,3),
         legend = c('VP0', 'GCV'))

}

#' @export
#' @rdname roiPlot
summary.roiGls <- function(obj) {

  print(summary(obj$mdl))

  cat('\nSummary of the best ROI size\n')

  ## Extract the selection criteria
  if(obj$criteria == 'gcv')
    crit = obj$gcv
  else if(obj$criteria == 'avp')
    crit = obj$avp
  else if(obj$criteria == 'vp0')
    crit = obj$vp0
  else if(obj$criteria == 'avpo')
    crit = obj$avpo

  oid <- order(crit[obj$iiform,])

  if(length(oid) > 5) oid <- oid[1:5]


  print(data.frame( Size = obj$nk[oid],
                    SD0  = sqrt(obj$vp0[obj$iiform,oid]),
                    GCV  = obj$gcv[obj$iiform,oid],
                    AVP  = obj$avp[obj$iiform,oid],
                    AVPO  = obj$avpo[obj$iiform,oid]))

  cat('\n Nearest sites\n')
  print(obj$donor)

  cat('\n Summary of the best formulas\n')
  nid <- t(apply(crit,1,order))
  fid <- order(apply(crit,1,min))

  if(length(fid) > 5) fid <- fid[1:5]

  nid <- nid[fid,1]

  print(obj$form[fid])
  if(length(fid) == 1){
    print(data.frame( Size = obj$nk[nid],
                    SD0  = (sqrt(obj$vp0[fid,nid])),
                    GCV  = (obj$gcv[fid,nid]),
                    AVP  = (obj$avp[fid,nid]),
                    AVPO = (obj$avpo[fid,nid])))

  } else{
    print(data.frame( Size = obj$nk[nid],
                    SD0  = diag(sqrt(obj$vp0[fid,nid])),
                    GCV  = diag(obj$gcv[fid,nid]),
                    AVP  = diag(obj$avp[fid,nid]),
                    AVPO = diag(obj$avpo[fid,nid])))
  }
 }



##################################################################
#' Linear model with observational errors
#'
#' @export
#' @examples
#'
#' nn <- 1000
#' xx <- data.frame( x1 = runif(nn), x2 = runif(nn)) * 2
#'
#' S <- exp(-as.matrix(dist(xx)))
#'
#' sres <- mnormt::rmnorm(1, varcov = S)
#'
#' y <- 5 -2 * xx[,1] + 3 * xx[,2] + rnorm(nn) + sres
#'
#'  fit <- lmFlood(y ~ 1+ x1 + x2, xx, S)
#'
#' summary(fit)

lmFlood <- function(form, x, S ,  tol = 1e-8){

  ## extract info from formula
  x <- model.frame(form,x)
  xm <- model.matrix(form,x)

  n <- nrow(x)
  vnames <- colnames(xm)

  ## Solve initial solution
  fit <- lm(x[,1] ~ xm -1)
  nedf <- n-fit$rank

  ## Fit an initial OLS model
  sig2 <- sum(residuals(fit)^2)/nedf

  sig2Old <- Inf
  B <- NULL

  ## Iteration until model variance converge
  while(abs(sig2-sig2Old) > tol & sig2 > tol){

    B <- chol(solve(diag(nrow(S)) + S/sig2))
    By <- as.numeric(B %*% x[,1])
    Bx <- B %*% xm

    ## Fit OLS model
    fit <- lm(By ~ Bx - 1)

    ## Update model variance
    sig2Old <- sig2
    sig2 <- sum(residuals(fit)^2)/nedf

  }

  ## Keep model info
  ans <- list(model = x,
              formula = form,
              rank = fit$rank,
              df.residuals = nedf,
              coefficients = coef(fit),
              fitted.values = xm %*% coef(fit),
              B = B,
              qr = fit$qr,
              residuals.b = residuals(fit),
              fitted.b = fitted(fit),
              sig2 = sig2,
              S = S,
              vcov = vcov(fit),
              llik = as.numeric(logLik(fit)),
              dev = deviance(fit))

  ans$residuals <- x[,1] - ans$fitted.values
  names(ans$coefficients) <- vnames
  rownames(ans$vcov) <- colnames(ans$vcov) <- vnames

  class(ans) <- 'lmFlood'
  return(ans)
}

#' @export
fitted.lmFlood <- function(obj, type = 'working'){
  if(type == 'working') ans <- obj$fitted.values
  else if(type == 'b') ans <- obj$fitted.b
  return(as.vector(ans))
}

#' @export
residuals.lmFlood <- function(obj, type = 'working'){
  if(type == 'working') ans <- obj$residuals
  else if(type == 'b') ans <- obj$residuals.b
  else if(type == 'sb') ans <- obj$residuals.b/sqrt(obj$sig2)
  else if(type == 'pearson'){
    ans <- obj$residuals/sqrt(obj$sig2+ diag(obj$S))
  }

  return(as.vector(ans))
}

#' @export
vcov.lmFlood <- function(obj, type = 'total'){
  if(type == 'total')         ans <- obj$sig2 * diag(nrow(obj$S)) + obj$S
  else if(type == 'sampling') ans <- obj$S
  else if(type == 'model')    ans <- obj$sig2 * diag(nrow(obj$S))
  else if(type == 'beta')     ans <- obj$vcov

  return(ans)
}

predict.lmFlood <- function(obj, newData = NULL){
  if(is.null(newData)){
    ans <- fitted(obj,'working')

  } else if(is.matrix(newData)) {
    ans <- newData %*% coef(obj)

  } else{
    tt <- delete.response(terms(obj$form))
    ans <- model.matrix(tt, as.list(newData)) %*% coef(obj)
  }

  return(as.vector(ans))
}


#' @export
print.lmFlood <- function(obj){
  cat('\nGeneralized least squares fit of the flood model')
  cat('\n\nModel:\n')
  print(obj$formula)

  n <- nrow(obj$model)
  cat('\nStandard model error:', format(sqrt(obj$sig2), digits = 4))
  cat('\nObs: ', n, ', DoF: ', n-obj$rank, '\n\n', sep = '')

}

#' @export
summary.lmFlood <- function(obj){

  n <- nrow(obj$model)
  p <- obj$rank
  res <- summary(residuals(obj,'pearson'))

  dev <- 1 - obj$dev /deviance(lm(I(obj$B %*% obj$model[,1]) ~ 1))
  nhs <- 1 - var(residuals(obj))/var(obj$model[,1])

  llik <- data.frame( AIC = 2 * (p - obj$llik) ,
                      BIC = log(n) * p -  2*obj$llik,
                      llik = obj$llik,
                      dev.Expl. = dev,
                      Nash = nhs)

  beta <- coef(obj)
  std <- sqrt(diag(obj$vcov))
  tval <- beta / std
  pval <- 1 - pt(abs(tval), nrow(obj$model) - obj$rank)

  corr <- cov2cor(obj$vcov)

  coef <- data.frame(Estimate = beta,
                     Std.Error = std,
                     t.value = tval,
                     p.value = pval)

  shapiro <- shapiro.test(residuals(obj,'pearson'))


  ans <- list(n = n, p = p,
              res = res,
              llik = llik,
              coef = coef,
              sig2 = obj$sig2,
              corr = corr,
              formula = obj$formula,
              shapiro = shapiro)

  class(ans) <- 'summary.lmFlood'
  return(ans)

}

#' @export
print.summary.lmFlood <- function(obj){

  cat('\nGeneralized least squares fit of the flood model')
  cat('\n\nModel:\n')
  print(obj$formula)
  cat('\n')
  print(obj$llik, digits = 4, row.names = FALSE)

  cat('\n\nCoefficients:\n')
  print(obj$coef, digits = 4)

  cat('\n\nCorrelation:\n')
  tmp <- format(round(obj$corr,2))
  tmp[upper.tri(tmp)] <- ""
  diag(tmp) <- ""
  print(tmp[-1,-obj$p], quote = FALSE)

  cat('\n\nResiduals:\n')
  print(obj$res)

  cat('\nStandard model error:', format(sqrt(obj$sig2), digits = 4))
  cat('\nObs: ', obj$n, ', DoF: ', obj$n-obj$p, '\n\n', sep = '')

  print(obj$shapiro)

}

#' @export
qqplot <- function(x, ...) UseMethod('qqplot', x)

#' @export
qqplot.default <- function(x) stats::qqplot(x, ...)

#' @export
qqplot.lmFlood <- function(x){

  n <- length(x$residuals)
  emp <- sort(scale(residuals(x)))
  theo <- qnorm((1:n)/(n+1))

  plot(theo, emp, ylab = 'Emprical quantiles', xlab = 'Theoritical quantiles')
  abline(0,1, lty = 3)
}

#' @export
qqplot.roiGls <- function(x) qqplot(x$mdl)

#' @export
AIC.lmFlood <- function(obj, k = 2) k*p - 2 * obj$llik

#' @export
logLik.lmFlood <- function(obj) obj$llik

#' @export
plot.lmFlood <- function(obj , type = 'working',
                         ylab = 'Residuals', xlab = 'Fitted',
                         ...) {
  if(type %in% c('working','pearson')){
    plot(fitted(obj,'working'), residuals(obj,'pearson'),
         xlab = xlab, ylab = ylab,...)
    abline(h=0, lty = 3)

  } else {
    plot(fitted(obj,'b'), residuals(obj,'sb'),
         xlab = xlab, ylab = ylab,...)
    abline(h=0, lty = 3)

  }

}

#' @export
termplot.lmFlood <- function(obj, vid = 1, alpha = .05, ci = TRUE,
                             ylab = 'Partial residuals', xlab = NULL,
                             col = 'black', ...){

  ## Extract and sort the selected variable
  xm <- model.matrix(obj$formula, obj$model)
  oid <- order(xm[,vid])
  x <- xm[oid,vid]

  vname <- colnames(xm)[vid]

  ## Compute partial residuals and fitted values
  beta <- coef(obj)[vid]
  err <- residuals(obj)[oid]
  hat <- beta * x

  ## Plot the results
  if(is.null(xlab)) xlab <- vname

  plot(x, hat + err, xlab = xlab, ylab = ylab, col = col)
  lines(x, hat, col = col)

}

################################################################
#' Utility functions for roiGls
#'
#' Provide a link with the \code{\link{lmFlood}} model
#'
#' @export
plot.roiGls <- function(x, ...) plot(x$mdl,...)

#' @export
termplot <- function(model, ...) UseMethod('termplot',model)

#' @export
termplot.default <- function(model, ...) stats::termplot(model, ...)

#' @export
termplot.roiGls <- function(obj,...) termplot(obj$mdl, ...)

#' @export
coef.roiGls <- function(obj) coef(obj$mdl)

#' @export
residuals.roiGls <- function(obj,...) residuals(obj$mdl,...)

#' @export
fitted.roiGls <- function(obj,...) fitted(obj$mdl,...)

#' @export
AIC.roiGls <- function(obj,...) AIC(obj$mdl,...)

#' @export
predict.roiGls <- function(obj,...) predict(obj$mdl,...)

#' @export
confint.roiGls <- function(obj,...) confint(obj$mdl)

#' @export
vcov.roiGls <- function(obj,...) vcov(obj$mdl,...)


#' @export
print.roiGls <- function(obj, ...){
  cat('\nROI model with', obj$nk[obj$ii], 'sites \n')
  print(obj$form[[obj$iiform]])
  cat('\nCenter at\n')
  tt <- delete.response(terms(obj$form[[obj$iiform]]))
  print(as.numeric(model.frame(tt,obj$x0)), digits = 4)
}

###############################################################
#' Performing ridge regression in GLS flood framework
#'

#' @export
ridge2Edf <- function(z, r)  sum(z/(z+r))

#' @export
edf2Ridge <- function(z, edf){

  ## Fin and initial interval
  l1 <- length(z); l0 <- 0
  while(ridge2Edf(z,l1) > edf ){
    l0 <- l1
    l1 <- 2*l1
  }

  return(uniroot(function(r) edf - ridge2Edf(z,r), c(l0,l1) )$root)
}

#' @export
ridgeCoef <- function(beta, z, ridge) beta * z/(z+ridge)


#' @export
ridgeFlood <- function(form, x, S , criteria = 'gcv', ridge.val = NULL,
                       ridge.lim = NULL, edf.lim = NULL, ridge.n = 100,
                       tol = 1e-8){

  ## extract info from formula
  x <- model.frame(form,x)
  xm <- model.matrix(form,x)

  n <- nrow(x)
  p <- ncol(xm)
  nedf <- n-p

  ybar <- mean(x[,1])
  y <- x[,1] - ybar

  ## Fit an initial OLS model
  d <- svd(xm)
  beta <- d$v %*% diag(1/d$d) %*% t(d$u) %*% x[,1]
  w <- as.numeric(y - xm %*% beta)

  sig2 <- sum(w*w)/nedf

  sig2Old <- Inf
  U <- NULL

  ## Loop throught iterative least-squares
  while(abs(sig2-sig2Old) > tol & sig2 > tol){

    U <- chol(solve(diag(nrow(S)) + S/sig2))
    Uy <- as.numeric(U %*% y)
    Ux <- U %*% xm

    ## Fit OLS model
    d <- svd(Ux)
    beta <- d$v %*% diag(1/d$d) %*% t(d$u) %*% Uy
    w <- as.numeric(Uy - Ux %*% beta)

    ## Update model variance
    sig2Old <- sig2
    sig2 <- sum(w*w)/nedf

  }

  ## Determine a list of ridge coefficients to evaluate
  dsq <- d$d * d$d

  if(criteria == 'hkb'){
    ridge <- p * sig2 / sum(beta^2)
  }
  else if(criteria == 'gcv'){

    ## transform the edf in ridge coefficient
    if(!is.null(ridge.val)){
      ridge <- ridge.val
    } else if(!is.null(ridge.lim)){
      ridge <- seq(ridge.lim[1], ridge.lim[2], l = ridge.n)

    } else if(!is.null(edf.lim)){
      ridge <- seq(edf2Ridge(dsq, edf.lim[2]),
                   edf2Ridge(dsq, edf.lim[1]), l = ridge.n)
    } else{
      ridge <- seq(0, edf2Ridge(dsq,1), l = ridge.n)
    }

  }

  ## Seach for the best ridge coefficient
  tab <- matrix(NA, length(ridge), 3 + p)
  bid <- (1:p)+3
  colnames(tab) <- c('ridge','edf','gcv', colnames(xm))

  tab[,1] <- ridge

  for(ii in 1:nrow(tab)){
    beta0 <- ridgeCoef(beta, dsq, ridge[ii])
    res0 <- Uy - Ux %*% beta0
    edf0 <- ridge2Edf(dsq,ridge[ii])
    gcv0 <- n/(n-edf0)^2 * sum(res0^2)

    tab[ii,2] <- edf0
    tab[ii,3] <- gcv0
    tab[ii,bid] <- beta0
  }

  if(nrow(tab)== 1) oid <- 1
  else oid <- which.min(tab[,'gcv'])

  beta <- tab[oid,bid]
  fitted <- ybar + xm %*% beta
  resid <- x[,1] - fitted
  edf <- tab[oid,'edf']
  w <- as.numeric(Uy - Ux %*% beta)
  sig2 <- sum(w*w)/(n-edf)


  ans <- list(coefficients = beta,
              fitted.values = fitted,
              residuals = resid,
              df.residuals = n - edf,
              edf = edf,
              rank = p,
              model = x,
              ridge.table = tab,
              formula = form,
              svd = d,
              sig2 = sig2)

  class(ans) <- c('list','ridgeFlood')

  return(ans)
}

###############################################################
#' Managing list of roiGls object
#'
#' A list of \link{roiGLS} model output. The following functions
#' extract the common elements, the predicted values and the
#' coefficients.
#'
#' @param l List of roiGls output
#'
#' @param type String that provides the element to extract. One of
#' 'nk', 'sig2', 'gcv','avp','avpo','vp0' or 'formula.
#'
#' @param se Should the standard error be returned.
#'
#' @export
#' @examples
#'
#' xd <- ungaugedDemoData()
#'
#' mdls <- list()
#'
#' ## Perform leave-one-out cross validation
#' for(kk in 1:3){
#'   mdls[[kk]] <- roiGls( y ~ x,
#'               x = xd$data[-kk,], x0 = xd$data[kk,],
#'               S = xd$S[-kk,-kk], distance = xd$distance[kk,-kk],
#'               nk = seq(20,50,10))
#' }
#'
#' roiCoef(mdls)
#'
#' roiGet(mdls, 'vp0')

#' @export
roiGet  <- function(l, type){
  if( type == 'nk'){
    ans <- sapply(l, function(z) z$nk[z$ii])

  } else if(type == 'sig2'){
    ans <- sapply(l, function(z) z$mdl$sig2[z$iiform,z$ii])

  } else if(type == 'gcv'){
    ans <- sapply(l, function(z) z$gcv[z$iiform,z$ii])

  }else if(type == 'avp'){
    ans <- sapply(l, function(z)  z$avp[z$iiform,z$ii])

  } else if(type == 'vp0'){
    ans <- sapply(l, function(z)  z$vp0[z$iiform,z$ii])

  } else if(type == 'avpo'){
    ans <- sapply(l, function(z)  z$avpo[z$iiform,z$ii])

  }else if(type == 'formula'){
    ans <- sapply(l, function(z) z$form[[iiform]])

  }else if(type == 'edf'){
    ans <- sapply(l, function(z) z$mdl$edf)
  }

  return(ans)
}

#' @export
#' @rdname roiGet
roiPred    <- function(l, se = FALSE){
  ans <- sapply(l, function(z) z$y0[z$iiform,z$ii])

  if(se == FALSE) return(ans)

  spred <- sapply(l, function(z) sqrt(z$vp0[z$iiform,z$ii]))

  return(data.frame(yhat = ans, se = spred))
}

#' @export
list2mat <- function(l, empty = NA){
 lnames <- sapply(l, function(z) names(z))
 unames <- unique(unlist(lnames))

 ans <- matrix(empty,length(l),length(unames))
 colnames(ans) <- unames

 for(ii in 1:length(l)){
    ans[ii,names(l[[ii]])] <- as.numeric(l[[ii]])
 }
 return(ans)
}

#' @export
#' @rdname roiGet
roiCoef <- function(l){
  return(t(sapply(l, function(z) coef(z$mdl))))
}

######################################################################
#' Demo data for ungauged analysis
#'
#' Returns a list containing the at-site means and drainage area
#' at the logarithm scale; a matrix of distance between sites and
#' a matrix of sampling error assuming correlation from
#' exponential model. For illustration purpose only.
#'
#' @export
#'
ungaugedDemoData <- function(){

  ## Extract the site with descriptors
  info <- canadaFlood$info
  ams <- split(canadaFlood$ams$flow, canadaFlood$ams$id)
  avg <- sapply(ams, mean)

  ## Approximation of the standard error at logarithm scale (delta method)
  sigma <- sapply(ams,sd) / sqrt(sapply(ams,length))/ avg

  ## Compute geographical distance
  coord <- info[,c('lon','lat')]
  h <- geosphere::distm(coord,coord, geosphere::distHaversine)/1000

  ## Covariance matrix
  S <- diag(sigma) %*% corModel(c(900,.12), h) %*% diag(sigma)

  return(list(data = data.frame( y = log(avg), x = log(info$area)),
              S = S,
              distance = h))

}



###############################################################
#' Performing ridge regression in GLS flood framework
#'
#'  #Experimental
#'
#' @export
ridgeFlood <- function(form, x, S , criteria = 'gcv', ridge.val = NULL,
                       ridge.lim = NULL, edf.lim = NULL, ridge.n = 100,
                       tol = 1e-8){

  ## extract info from formula
  x <- model.frame(form,x)
  xm <- model.matrix(form,x)

  n <- nrow(x)
  p <- ncol(xm)
  nedf <- n-p

  ybar <- mean(x[,1])
  y <- x[,1] - ybar

  ## Fit an initial OLS model
  d <- svd(xm)
  beta <- d$v %*% diag(1/d$d) %*% t(d$u) %*% x[,1]
  w <- as.numeric(y - xm %*% beta)

  sig2 <- sum(w*w)/nedf

  sig2Old <- Inf
  U <- NULL

  ## Loop throught iterative least-squares
  while(abs(sig2-sig2Old) > tol & sig2 > tol){

    U <- chol(solve(diag(nrow(S)) + S/sig2))
    Uy <- as.numeric(U %*% y)
    Ux <- U %*% xm

    ## Fit OLS model
    d <- svd(Ux)
    beta <- d$v %*% diag(1/d$d) %*% t(d$u) %*% Uy
    w <- as.numeric(Uy - Ux %*% beta)

    ## Update model variance
    sig2Old <- sig2
    sig2 <- sum(w*w)/nedf

  }

  ## Determine a list of ridge coefficients to evaluate
  dsq <- d$d * d$d

  if(criteria == 'hkb'){
    ridge <- p * sig2 / sum(beta^2)
  }
  else if(criteria == 'gcv'){

    ## transform the edf in ridge coefficient
    if(!is.null(ridge.val)){
      ridge <- ridge.val
    } else if(!is.null(ridge.lim)){
      ridge <- seq(ridge.lim[1], ridge.lim[2], l = ridge.n)

    } else if(!is.null(edf.lim)){
      ridge <- seq(edf2Ridge(dsq, edf.lim[2]),
                   edf2Ridge(dsq, edf.lim[1]), l = ridge.n)
    } else{
      ridge <- seq(0, edf2Ridge(dsq,1), l = ridge.n)
    }

  }

  ## Seach for the best ridge coefficient
  tab <- matrix(NA, length(ridge), 3 + p)
  bid <- (1:p)+3
  colnames(tab) <- c('ridge','edf','gcv', colnames(xm))

  tab[,1] <- ridge

  for(ii in 1:nrow(tab)){
    beta0 <- ridgeCoef(beta, dsq, ridge[ii])
    res0 <- Uy - Ux %*% beta0
    edf0 <- ridge2Edf(dsq,ridge[ii])
    gcv0 <- n/(n-edf0)^2 * sum(res0^2)

    tab[ii,2] <- edf0
    tab[ii,3] <- gcv0
    tab[ii,bid] <- beta0
  }

  if(nrow(tab)== 1) oid <- 1
  else oid <- which.min(tab[,'gcv'])

  beta <- tab[oid,bid]
  fitted <- ybar + xm %*% beta
  resid <- x[,1] - fitted
  edf <- tab[oid,'edf']
  w <- as.numeric(Uy - Ux %*% beta)
  sig2 <- sum(w*w)/(n-edf)


  ans <- list(coefficients = beta,
              fitted.values = fitted,
              residuals = resid,
              df.residuals = n - edf,
              edf = edf,
              rank = p,
              model = x,
              ridge.table = tab,
              formula = form,
              svd = d,
              sig2 = sig2)

  class(ans) <- c('list','ridgeFlood')

  return(ans)
}


#' @export
#' @rdname ridgeFlood
ridge2Edf <- function(z, r)  sum(z/(z+r))

#' @export
#' @rdname ridgeFlood
edf2Ridge <- function(z, edf){

  ## Fin and initial interval
  l1 <- length(z); l0 <- 0
  while(ridge2Edf(z,l1) > edf ){
    l0 <- l1
    l1 <- 2*l1
  }

  return(uniroot(function(r) edf - ridge2Edf(z,r), c(l0,l1) )$root)
}

#' @export
#' @rdname ridgeFlood
ridgeCoef <- function(beta, z, ridge)
  beta * z/(z+ridge)
martindurocher/floodRFA documentation built on June 5, 2019, 8:44 p.m.