rloadest: River Load Estimation

Documented in loadReg

#' Load Estimation
#'
#' Build a rating-curve (regression) model for river load estimation.
#'
#' The left-hand side of the formula may be any numeric variable, just as with
#'\code{lm} or a variable of class "lcens," "mcens," or "qw." Also permitted are
#'variables constructed using \code{Surv} of \code{type} "right,", "interval," or
#'"interval2" (for left-censored data, use \code{as.lcens}.\cr
#'For un- or left-censored data, AMLE is used unless weights are specified in
#'the model, then MLE is used, through a call to \code{survreg}. For any other 
#'censored data, MLE is used.\cr
#'
#'Typically, \code{loadReg} expects the response variable to have units of
#'concentration, mass per volume. For these models, See \code{\link{loadConvFactor}}
#'for details about valid values for \code{flow.units}, \code{conc.units} and 
#'\code{load.units}. For some applications, like bed load estimation, the response
#'variable can have units of flux, mass per time. For these models, \code{conc.units}
#'can be expressed as any valid \code{load.units} per day. The rate must be expressed
#'in terms of "/d," "/dy," or "/day."
#'
#' @param formula a formula describing the regression model. See \bold{Details}.
#' @param data the data to search for the variables in \code{formula}.
#' @param subset an expression to select a subset of the data.
#' @param na.action what to do with missing values.
#' @param flow character string indicating the name of the 
#'flow column.
#' @param dates character string indicating the name of the 
#'date column.
#' @param flow.units character string describing the flow units. See \bold{Details}.
#' @param conc.units character string describing the concentration 
#'unit. See \bold{Details}.
#' @param load.units character string describing the load unit. See \bold{Details}.
#' @param time.step character string describing the time step of 
#'the calibration data. Must be one of "instantaneous," "2 hours," "3 hours,"
#'"4 hours," "6 hours," "12 hours," or "day." The default is "day."
#' @param station character string description of the station.
#'
#' @return An object of class "loadReg."
#' @seealso \code{\link{censReg}}, \code{\link{as.lcens}}, \code{\link{as.mcens}},
#'\code{\link{Surv}}
#' @references
#'Runkel, R.L., Crawford, C.G., and Cohn, T.A., 2004, Load estimator (LOADEST):
#'a FORTRAN program for estimating constituent loads in streams and rivers: U.S.
#'Geological Survey Techniques and Methods book 4, chap. A5, 69 p.
#' @keywords regression censored loads
#' @examples
#'# From application 1 in the vignettes
#'data(app1.calib)
#'app1.lr <- loadReg(Phosphorus ~ model(1), data = app1.calib, 
#'  flow = "FLOW", dates = "DATES", conc.units="mg/L",
#'  station="Illinois River at Marseilles, Ill.")
#'print(app1.lr)
#' @import smwrBase
#' @import smwrQW
#' @export
loadReg <- function(formula, data, subset, na.action, flow, dates,
                    flow.units="cfs", conc.units="", load.units="kg", 
                    time.step="day", station="") {
  ## Function to check if conc.units are in terms of loads
  ## returns logical with attribute of loads if TRUE
  ckLoad <- function(units) {
    # check if rate is /d, /dy, or /day (actually any /d)
    retval <- grepl("/d", units)
    if(retval) { # strip off rate
      attr(retval, "load") <- sub("/d.*", "", units)
    }
    return(retval)
  }
  
  #tbls will cause errors later - convert to standard data.frame
  if(any(grepl(pattern = "tbl", x = class(data)))){
    data <- data.frame(data)
  }
  ##
  ## Trap model number specification
  PredMod <- terms(formula, "model", data = data)
  indMod <- attr(PredMod, "specials")$model
  time.step <- match.arg(time.step, 
                         c("instantaneous", "2 hours",
                         "3 hours", "4 hours", "6 hours",
                         "8 hours", "12 hours", "day"))
  call <- match.call()
  m <- match.call(expand.dots = FALSE)
  Terms <- attr(m, "terms")
  ## Make sure that the formula is a formula and not a symbol--this
  # improves output and subsequent formula references
  if (typeof(call$formula) == "symbol") {
    #terms above is null 
    #formulas were not being converted from symbols, and causing errors at ~127
    call$formula <- formula
    m[['formula']] <- formula
  }
  ## remove components not needed for model.frame
  m$flow <- m$dates  <- m$flow.units <- m$conc.units <- NULL
  m$load.units <- m$time.step <- m$station <- NULL
  m[[1L]] <- as.name("model.frame")
  if(is.null(indMod)) { # User-defined model
    model.no <- 99L
    m <- eval(m, parent.frame())
    Terms <- attr(m, "terms")
    xvars <- as.character(attr(Terms, "variables"))[-1L] # drop list
    Y <- model.extract(m, "response")
    X <- model.matrix(Terms, m)
    if((yvar <- attr(Terms, "response")) > 0L) {
      ynam <- xvars[yvar]
      xvars <- xvars[ - yvar]
    }
    if(length(xvars) > 0L) {
      xlevels <- lapply(m[xvars], levels)
      xlevels <- xlevels[!unlist(lapply(xlevels, is.null))]
      if(length(xlevels) == 0L)
        xlevels <- NULL
    }
    else xlevels <- NULL
    na.message <- attr(m, "na.message")
    saved.na.action <- attr(m, "na.action")
    if(missing(subset)) 
      subset <- seq(nrow(data))
    else
      subset <- eval(substitute(subset), data)
    Flow <- data[subset, flow]
    Time <- data[subset, dates]
    if(!is.null(saved.na.action)) {
      Flow <- Flow[-saved.na.action]
      Time <- Time[-saved.na.action]
    }
    PoR <- range(Time)
    Qadj <- loadestQadj(Flow)
    Tadj <- loadestTadj(Time)
  } else { # predefined model
    ## In fill the needed parameters from the call
    m[["formula"]] <- eval(m[["formula"]])
    model.no <- m[["formula"]][[3L]][[2L]]
    m[["formula"]][[3L]][[3L]] <- data
    m[["formula"]][[3L]][[4L]] <- flow
    m[["formula"]][[3L]][[5L]] <- dates
    m <- eval(m, parent.frame())
    Terms <- attr(m, "terms")
    xvars <- as.character(attr(Terms, "variables"))[-1L] # drop list
    if(length(xvars) > 2L) # Should actually generate an error before here
      stop("If model is used, then no other explanatory variables can be specified")
    Y <- model.extract(m, "response")
    subset <- m[[2L]]
    if((yvar <- attr(Terms, "response")) > 0L) {
      ynam <- xvars[yvar]
    }
    na.message <- attr(m, "na.message")
    saved.na.action <- attr(m, "na.action")
    xlevels <- NULL
    ## Construct the model matrix
    Flow <- data[subset, flow]
    Time <- data[subset, dates]
    PoR <- range(Time)
    if(class(time)[1] == "Date")
      Time <- Time + 0.5
    Qadj <- loadestQadj(Flow)
    Tadj <- loadestTadj(Time)
    X <- setXLDat(data[subset,], flow, dates, Qadj, Tadj, model.no)
    xvars <- colnames(X)[-1L]
  }
  if(class(Y) == "numeric") {
    Y <- as.lcens(Y)
  } else if(class(Y) == "qw") {
    ## If conc.units are not specified, then try to get from object
    if(conc.units == "") {
      conc.units <- unique(Y@reporting.units)
      conc.units <- conc.units[!is.na(conc.units)]
      if(length(conc.units) == 0L)
        conc.units <- ""
      else if(length(conc.units) > 1L) {
        ## Select the first one with a length greater than 1
        lens <- nchar(conc.units)
        conc.units <- conc.units[which(lens > 1L)[1L]]
      }
      ## Now make sure that as ... gets dropped
      conc.units <- strsplit(conc.units, " ")[[1L]][1L]
    }
    ## Required to fix the inability of model extraction to select all required parts
    if(!is.null(saved.na.action)) {
      Y@reporting.level <- Y@reporting.level[-saved.na.action]
      Y@remark.codes <- Y@remark.codes[-saved.na.action]
    }
    if(censoring(Y) == "multiple")
      Y <- as.mcens(Y)
    else
      Y <- as.lcens(Y)
  } else if(class(Y) == "Surv") {
    type <- attr(Y, "type")
    if(type == "left")
      stop("Use as.lcens for left-censored data")
    if(type == "interval") {
      # For interval data, time 2 is only important for interval censored data:
      # time1 contains the uper/lower limit for left- or right-censored data
      Ymat <- unclass(Y)
      Ystat <- as.integer(Ymat[, 3L])
      if(any(Ystat) > 2L # interval
         || any(Ystat) < 1L ) { # right
        # Convert to mcens, first left-censored: swap columns
        Ymat[, 2L] <- ifelse(Ystat == 2L, Ymat[, 1L], Ymat[, 2L]) 
        Ymat[, 1L] <- ifelse(Ystat == 2L, -Inf, Ymat[, 1L])
        # right-censored: set time2 to Inf
        Ymat[, 2L] <- ifelse(Ystat == 0L, Inf, Ymat[, 2L])
        Y <- as.mcens(Ymat[, 1L], Ymat[, 2L])
      } else # Only left-censored
        Y <- as.lcens(Y[, 1L], censor.codes=Ystat==2L)
    } else
      stop("Invalid Surv type: ", type)
  } else if(class(Y) == "lcens" && !is.null(saved.na.action))
    Y@censor.codes <- Y@censor.codes[-saved.na.action]
  ## Logic checks for time.step and class of Time
  if(class(Time)[1L] != "Date" && time.step == "day")
    warning("For time.step = \"day,\" the class of dates should be \"Date\"")
  else if(class(Time)[1L] == "Date" && time.step != "day")
    warning("For time.step less than \"day,\" the class of dates should be \"POXIXt\"")
  if(time.step == "day") { # Check that no duplicate days and appropriate gap
    Time <- as.Date(Time)
    Tdifs <- c(999L, diff(as.integer(Time)))
    if(any(Tdifs == 0L)) {
      Dys <- as.character(unique(Time[Tdifs == 0L]))
      Dys <- paste(Dys, collapse="\n")
      stop("Duplicated days not permitted for time.step = \"day\"\n", Dys)
    }
    Tdys <- min(Tdifs)
    if(Tdys < 7)
      warning("The minimum spacing between daily loads is ", signif(Tdys, 2L),
              " days. The time between observations should be at least ",
              " 7 days to avoid autocorrelation issues.")
  } else { # Need unit checks too
    Tdys <- difftime(Time, shiftData(Time), units="days")
    Tdys <- min(Tdys, na.rm=TRUE)
    if(Tdys < 7)
      warning("The minimum spacing between observed loads is ", signif(Tdys, 2L),
              " days. The time between observations should be at least ",
              " 7 days to avoid autocorrelation issues.")
  }
  ## Checks on concentration units
  if(conc.units == "") {
    warning("Concentration units assumed to be mg/L")
    conc.units <- "mg/L"
  }
  load.only <- ckLoad(conc.units)
  if(load.only) {
    cfit <- NULL
    ## Prepare for load model fit
    CF <- loadUnitConv(attr(load.only, "load"), load.units)
    Y <- Y * CF
  } else {
    ## OK, construct the concentration fit
    if(class(Y) == "lcens") {
      cfit <- censReg_AMLE.fit(Y, X, "lognormal")
      fit1 <- censReg_AMLE.fit(Y, X[,1L, drop=FALSE], "lognormal")
    } else {
      cfit <- censReg_MLE.fit(Y, X, rep(1, length(Y)), "lognormal")
      fit1 <- censReg_MLE.fit(Y, X[,1L, drop=FALSE], 
                              rep(1, length(Y)), "lognormal")
    }
    cfit$LLR1 <- fit1$LLR
    cfit$call <- call
    cfit$terms <- Terms
    cfit$na.action <- saved.na.action
    cfit$na.message <- na.message
    cfit$xlevels <- xlevels
    class(cfit) <- "censReg"
    ## Now prepare the load model fit
    CF <- loadConvFactor(flow.units, conc.units, load.units)
    Y <- Y * CF * Flow
  }
  if(class(Y) == "lcens") {
    lfit <- censReg_AMLE.fit(Y, X, "lognormal")
    fit1 <- censReg_AMLE.fit(Y, X[,1L, drop=FALSE], "lognormal")
    method <- "AMLE"
  } else {
    lfit <- censReg_MLE.fit(Y, X, rep(1, length(Y)), "lognormal")
    fit1 <- censReg_MLE.fit(Y, X[,1L, drop=FALSE], 
                             rep(1, length(Y)), "lognormal")
    method <- "MLE"
  }
  lfit$LLR1 <- fit1$LLR
  lfit$call <- call
  lfit$terms <- Terms
  lfit$na.action <- saved.na.action
  lfit$na.message <- na.message
  lfit$xlevels <- xlevels
  class(lfit) <- "censReg"
  ## Construct the evaluation data frame
  ## Capture errors
  if(lfit$IERR > 0L) {
    lfit$AIC <- lfit$SPPC <- Inf
    lfit$LLR <- -Inf
  }
  MEC <- data.frame(model=model.no, AIC=lfit$AIC, SPCC=lfit$SPPC)
  retval <- list(station=station, constituent=ynam, 
                 flow=flow, dates=dates, Qadj=Qadj,
                 Tadj=Tadj, model.no=model.no, model.eval=MEC,
                 method=method, conc.units=conc.units, Time=Time,
                 Sum.flow=summary(Flow), flow.units=flow.units,
                 load.units=load.units, PoR=PoR, xvars=xvars,
                 lfit=lfit, cfit=cfit, time.step=time.step)
  class(retval) <- "loadReg"
  return(retval)
}