floodStat: FloodNet Regional Frequency Analysis

########################################################################
#' Prediction at ungauged sites using region of influence and kriging
#'
#' Return the prediction of a local regression model using region of
#' influence (ROI) where the the residuals are further predicted
#' by kriging.
#'
#' @param x Data for trainging the model.
#'
#' @param xnew Data at new locations (Validation set).
#'
#' @param k Number of sites in the neighborhoods.
#'
#' @param trend Formula defining the trend.
#'
#' @param similarity Formula defining covariates used
#'   for forming region of influence based on Euclidien distance.
#'
#' @param kriging Formula defining the spatial covariates .
#'   used for predicting residuals with spatial correlation.
#'
#' @param model Variogram model. See \code{\link[gstat]{vgm}}
#'
#' @param ker Should a kernel be used in the local regression model.
#'
#' @param ... More arguments to pass to kriging function \code{\link{autoKrige}}
#'
#' @return
#'
#' \item{pred}{Prediction at new sites.}
#' \item{pred.se}{Standard deviation at new sites.}
#' \item{trend}{Part of the prediction attributed to trend}
#' \item{trend.se}{Standard deviation associated with the trend}
#' \item{fitted}{Fitted values (training sites)}
#' \item{fitted.se}{Standard deviaton of the fitted values}
#' \item{vgm}{Sample variogram.}
#' \item{model}{Fitted variogram model.}
#'
#' @export
#'
#' @examples
#' attach(canFlood)
#'
#' ## Transform data if necessary
#' xdf <- with(canFlood,
#'   data.frame(y      = log(L1),
#'              area   = scale(log(BASIN_AREA)),
#'              wb     = scale(log(WB_AREA_PC)),
#'              stream = scale(log(STREAM_DEN)),
#'              map    = scale(log(PPTAVG_BAS)),
#'              lon    = ELON,
#'              lat    = ELAT))
#'
#'  # Note that the coordinates were transform to a Euclidean space using
#'  # multidimentional scaling
#'
#'  ## select a validation and training set
#'  vid <- runif(nrow(xdf)) > .8
#'  tid <- !vid
#'
#'  # formula of the relationship between flood quantile and descriptors
#'  ftrend <- y ~ area + map + wb + stream
#'  fsimilarity <- ~ area + map + wb + stream
#'
#'  ## Fit a local regression model
#'  fit <- FitRoi(x = xdf[tid,], xnew = xdf[vid,], k = 50,
#'                ftrend, similarity = fsimilarity)
#'  print(fit)
#'  sd(xdf[vid,'y'] - fit$pred)
#'
#'  ## Refit the model and perform the kriging of the residuals
#'  fitk <- FitRoi(x = xdf[tid,], xnew = xdf[vid, ], k =50,
#'                ftrend, loc = fsimilarity ,
#'                kriging = ~ lon + lat)
#'  print(fitk)
#'  sd(xdf[vid,'y'] - fitk$trend)
#'  sd(xdf[vid,'y'] - fitk$pred)
#'
#'  ifold0 <- sample(floor((1:n)/(n+1)*fold)+1)
#'
#'  ## Perform cross-validation. Long to compute
#'  ## This speed up the computation as there is no search for a best  model
#'  f <- FitRoi.cv(x = xdf, k = seq(20,150, 10), fold = ifold0,
#'                ftrend,  similarity = ~ lon + lat,
#'                verbose = TRUE, crit = 'mad')
#'

FitRoi <- function(x, xnew, k, trend, similarity, kriging = NULL, ker = TRUE, ...){

  x <- as.data.frame(x)
  xnew <- as.data.frame(xnew)

  ## Extract coordinate and compute euclidean distance
  crd <- rbind(model.frame(similarity, x),
               model.frame(similarity, xnew))

  h <- as.matrix(dist(crd))

  ## Extract coordinate of the spatial model
  if(!is.null(kriging))
    crd2 <- rbind(model.frame(kriging,x),
                  model.frame(kriging,xnew))

  ## Create a vector of neighborhood size
  if(length(k) == 1)
    nk <- rep(k,nrow(h))
  else
    nk <- k


  ## Reformat the data
  x <- data.frame(model.frame(trend,x))
  xnew <- data.frame(model.frame(delete.response(terms(trend)), xnew))

  train <- seq(nrow(x))
  valid <- seq(nrow(x)+1,nrow(crd))

  ## compute mean at target
  pred <- predSe <- rep(NA, length(valid))

  for(ii in seq_along(valid)){

    ## ## Delineate a neighborhood
    nn <- FindNearest(h[valid[ii],train], nk[valid[ii]])

    ## Fit local linear model. With weight if necessary.
    if(ker){
      ww <- 1-(h[valid[ii],nn]/max(h[valid[ii],nn]))^2
      fit <- lm(trend, data = cbind(x[nn,],ww = ww),weights = ww)
    } else{
      fit <- lm(trend, data = x[nn,])
    }

    out <- predict(fit, xnew[ii,], se.fit = TRUE)
    pred[ii] <- out$fit
    predSe[ii] <- out$se.fit
  }

  ans <- list(call = list(k = k, npred = length(valid), nsite = length(train),
                          ker = ker, kriging = kriging, trend = trend,
                          similarity = similarity))

  ## create a list for the output
  if(!is.null(kriging)){

    ## For all training set
    yhat <- yhatSe <- rep(NA, length(train))
    for(ii in train){

      ## Delineate a neighborhood
      nn <- FindNearest(h[ii,train], nk[ii])

      ## Fit local linear model. With weight if necessary.
      if(ker){
        ww <- 1-(h[ii,nn]/max(h[ii,nn]))^2
        fit <- lm(trend, data = cbind(x[nn,], ww = ww), weights = ww)
      } else {
        fit <- lm(trend, data = x[nn,])
      }

      out <- predict(fit, x[ii,], se.fit = TRUE)
      yhat[ii] <- out$fit
      yhatSe[ii] <- out$se.fit
    }

    ## compute residuals
    yres <- x[,1]-yhat

    ## Save known component
    ans$trend <- pred
    ans$trend.se <- predSe
    ans$fitted <- yhat
    ans$fitted.se <- yhatSe
    ans$resid <- yres

    ## Extract the coordinates for the residuals
    crd2 <- data.frame(yres = c(yres, rep(0,length(pred))), crd2)

    sp::coordinates(crd2) <- kriging

    ## Perform simple kriging on the residuals
    out <- automap::autoKrige(
            yres~1, input_data = crd2[train,],
            new_data = crd2[valid,], beta = 0, debug.level = 0,
            ...)

    ans$pred <- pred + out$krige_output$var1.pred
    ans$resid.se <- out$krige_output$var1.stdev
    ans$pred.se <- sqrt(predSe^2 + ans$resid.se^2)

    ans$vgm <- out$exp_var
    ans$model <- out$var_model

  } else {
    ans$pred <- pred
    ans$pred.se <- predSe
  }

  ## return
  class(ans) <- 'roi'
  ans
}

FindNearest <- function(h,n) order(h)[seq(n)]

#' @export
print.roi <- function(obj){
 cat('\n\nLocal regression\n')
 cat('\nNumber of sites:', obj$call$nsite)
 cat('\nNumber of targets:', obj$call$npred)

 cat('\n\nROI:')
 cat('\n  Trend:', format(obj$call$trend))
 cat('\n  size:', sort(unique(obj$call$k)))
 cat('\n  Similarity', format(obj$call$similarity))
 cat('\n  Kernel:', format(obj$call$ker))

 if(!is.null(obj$model)){
   cat('\n\nKriging:')
   cat('\n  Coord:', format(obj$call$kriging),'\n\n')
   print(as.data.frame(obj$model)[,1:4])
 }

}

#' @export
FitRoi.cv <- function(x, k, trend, similarity,
                      kriging = NULL, ker = TRUE,
                      crit = 'mad', fold = 5,
                      verbose = TRUE, ...){

  x <- as.data.frame(x)

  ## Get response variable
  yhat <- x[,all.vars(trend)[1]]
  n <- length(yhat)

  ## Create indexes of cross-validation group
  if(length(fold) == 1){
    ifold <- sample(floor((1:n)/(n+1)*fold)+1)
    fold <- 1:nfold
  } else {
    ifold <- as.integer(factor(fold))
    fold <- 1:length(unique(ifold))
  }

  ## Function that compute the cross-validation criteria for a given
  ## neighborhood size
  FunCv <- function(nk){

    ## Compute prediction by local regression
    pred <- rep(NA,n)
    for(jj in fold){

      pred[ifold == jj] <-
        FitRoi(x = x[ifold != jj,],
               xnew = x[ifold == jj,],
               k = nk,
               trend = trend,
               similarity = similarity,
               kriging = kriging,
               ker = ker, ...)$pred
    }

    ## Compute cross-validation criteria
    err <- yhat - pred
    rerr <- yhat/pred - 1

    list(pred  = pred,
         rmse  = sqrt(mean(err^2)),
         rrmse = sqrt(mean(rerr^2)),
         nsh   = 1 - sum(err^2)/sum((yhat-mean(yhat))^2),
         mad   = mean(abs(err)),
         rmad  = mean(abs(rerr)),
         smad  = 1 - sum(abs(err))/sum(abs(yhat-median(yhat))))
  }

  if(is.matrix(k))
    ntry <- ncol(k)
  else
    ntry <- length(k)

  cv <- data.frame(matrix(NA, ntry,6))
  colnames(cv) <- c('rmse','rrmse','nsh','mad','rmad','smad')
  best <- Inf

  if(verbose)
    bar <- txtProgressBar()

  for(ii in seq_along(k)){

    if(verbose)
      setTxtProgressBar(bar, ii/ntry)

    ## Get the cross-validation criteria for every sites
    if(is.matrix(k))
      fii <- FunCv(k[ii,])
    else
      fii <- FunCv(k[ii])

    cv[ii,] <- c(fii$rmse, fii$rrmse, fii$nsh,
                 fii$mad,  fii$rmad,  fii$smad)

    ## Keep the best model
    if(crit == 'rmse')
      cii <- fii$rmse
    else if(crit == 'rrmse')
      cii <- fii$rrmse
    else if(crit == 'nsh')
      cii <- fii$rrmse
    else if(crit == 'mad')
      cii <- fii$mad
    else if(crit == 'rmad')
      cii <- fii$rmad
    else if(crit == 'smad')
      cii <- fii$rmad

    if(cii < best){
      pred <- fii$pred
      best <- cii
    }
  }

  ## return
  ans <- list(cv = cbind(k,cv), pred = pred, crit = crit)
  class(ans) <- 'roiCv'
  ans
}

#' @export
print.roiCv <- function(obj){

  lstCrit <- c('rmse','rrmse','nsh','mad','rmad','smad')
  bid <- which(obj$crit == lstCrit)+1
  best <- order(obj$cv[,bid])[1:3]

  cat('\nCross-validation for local regression\n')
  cat('\nBest 3 sizes of neighborhood\n')
  print(obj$cv[best,])

}

#' @export
plot.roiCv <- function(obj,...){

  lstCrit <- c('rmse','rrmse','nsh','mad','rmad','smad')
  bid <- which(obj$crit == lstCrit)+1
  best <- which.min(obj$cv[,bid])

  form <- as.formula(paste(obj$crit,'~k'))
  plot(form, obj$cv, type = 'l', ...)
  points(obj$cv[best,1],obj$cv[best,bid], col = 2, pch = 16)
}