R/loadInterp.R

#' loadInterp is a class of load models that hold interpolation functions.

#### interpModel class ####

#' The engine of a loadInterp model.
#' 
#' This is a model class to nest within loadInterp. This class, interpModel, is 
#' lightweight relative to loadInterp: its focus is on the interpolation rather 
#' than on units conversions, fitting, or other things that loadInterp does 
#' better. This is also the class that is produced by a call to the fitting 
#' function stored in loadInterp.
#' 
#' @include loadModel.R
#' @include interpolations.R
#' @rdname interpModel-class
#' @name interpModel-class
#' @slot dates.class Class of the dates.in. 
#' @slot dates.in Dates of the y.in data.
#' @slot y.in Data (usually fluxes, concentrations, or residuals) to 
#'   interpolate.
#' @slot interp.function Function that accepts arguments \code{dates.in}, 
#'   \code{y.in}, and \code{dates.out}, then returns predictions \code{y.out}
#' @importFrom methods setClass
#' @exportClass interpModel
#' @family load.model.fits
setClass(
  "interpModel",
  
  slots=c(
    dates.class="character",
    dates.in="ANY",
    y.in="ANY",
    interp.function="ANY"),
  
  prototype=c(
    dates.class=NA_character_,
    dates.in=NULL,
    y.in=NULL,
    interp.function=NULL),
  
  validity=function(object) {
    errorstrs <- character()
    
    #dates.in and y.in should be coercible to numeric and should be the same
    #length in most cases, but we'll leave that error checking for the
    #interp.function to handle more gracefully
    
    if(!is(object@interp.function,"function")) {
      errorstrs <- c(errorstrs, "interp.function should be a function")
    }
    
    if(length(errorstrs) > 0) {
      errorstrs
    } else {
      TRUE
    }
  }
)

#### loadInterp class ####

#' A load model class specific to interpolations for flux estimation.
#' 
#' loadInterps use a variety of interpolation methods to connect predictions of
#' y values (usually fluxes, concentrations, or residuals) over time.
#' 
#' @slot fit the interpolation model to be used.
#' @slot MSE numeric. The mean squared error, i.e., the variance of prediction
#'   errors, probably as estimated by leave-one-out cross validation.
#' @importFrom methods setClass
#' @exportClass loadInterp
#' @family load.model.classes
setClass(
  "loadInterp", 
  
  slots=c( 
    fit="interpModel",
    MSE="matrix"),
  
  contains="loadModel",
  
  # Validity of parent objects is checked first, automatically
  validity=function(object) {
    errorstrs <- character()
    
    validObject(object@fit)
    
    # Return
    if(length(errorstrs) == 0) {
      TRUE
    } else {
      errorstrs
    }
  }    
)

#' Create a fitted loadInterp object.
#' 
#' Generates a new model of class loadInterp (\code{\link{loadInterp-class}}) 
#' which can iterpolate among observations of concentration or flux.
#' 
#' loadInterps are simple load models that predict concentration or flux based 
#' on one or more preceding and following measurements of flux. The specific 
#' interpolation method can be varied; examples include linear, spline, and 
#' triangular interpolations. See \link{interpolations} for the full list of 
#' pre-defined options; others may also be defined by the user.
#' 
#' loadInterps are currently assumed to have normally distributed residuals. An 
#' unwitting user might violate this assumption without being caught by the 
#' code, so be careful! This assumption is mainly relevant to the calculation of
#' confidence or prediction intervals. Also, where other models such as loadReg 
#' and loadLm will retransform predictions back into linear space, loadInterps 
#' will not.
#' 
#' @importFrom methods new
#' @param interp.format character. Which sort of observation should the 
#'   interpolations be done among?
#' @param interp.function function. The function to use for interpolation. 
#'   Pre-defined choices are described in \link{interpolations}; additional 
#'   functions may be defined by the user as long as they adhere to the 
#'   guidelines given there.
#' @param data data.frame. The data to be interpolated
#' @param metadata metadata, used to access the appropriate columns of data. At 
#'   a minimum, \code{metadata} should correctly specify the date column and the
#'   column indicated by \code{interp.format}.
#' @param retrans.function irrelevant to loadInterp and must be NULL. for other
#'   models, permits fitting in log or other transformed spaces.
#' @param store One or more character strings specifying which information to 
#'   write within the model. Options are 'data': the original fitting data; 
#'   'fitting.function': a fitting function that can produce a new loadComp 
#'   object from new data (this currently uses the same new data for both 
#'   regression calibration and interpolation); 'uncertainty': an estimate of 
#'   uncertainty, which can take some time to compute but will permit creation 
#'   of uncertainty intervals, etc. in the prediction and aggregation phases.
#' @return A fitted loadInterp model.
#' @export
#' @family load.model.inits
loadInterp <- function(interp.format=c("flux","conc"), interp.function=linearInterpolation, 
                       data, metadata, retrans.function=NULL, 
                       # subset, na.action,  # these should be included, but it'll take a little work
                       store=c("data","fitting.function","uncertainty")) {
  
  # Validate arguments
  interp.format <- match.arg.loadflex(interp.format)
  if(!isTRUE(is.null(retrans.function))) {
    stop("Methods for a non-NULL retrans.function aren't currently implemented.")
  }
  
  # If requested, generate fitting function
  if("fitting.function" %in% store) {
    fitting_function <- function(training.data, store=c()) {
      loadInterp(interp.format=interp.format, interp.function=interp.function, data=training.data, 
                 metadata=metadata, retrans.function=retrans.function, store=store)
    }
  }
   
  # Check and prepare elements of interpModel
  if(!(metadata@dates %in% names(data))) {
    stop("metadata@dates must be in names(data)")
  }
  dates_in <- getCol(metadata=metadata, data=data, field="date", attach.units=FALSE)
  # Record the date class but convert to a standard numeric form. Do this 
  # conversion now, rather than during prediction, to save time in cases where 
  # many predictions are made from the same data (prime example: in
  # estimateMSE()).
  dates_class <- class(dates_in)
  dates_in <- as.numeric(as.POSIXct(dates_in))
  # Use calculate=NA to calculate only if necessary
  y_in <- observeSolute(data, interp.format, metadata, calculate=NA, attach.units=FALSE)
  
  if(!is.null(retrans.function)) warning("retrans.function isn't currently implemented for loadInterp")
    
  # Create the model
  load.model <- new(
    "loadInterp", 
    fit=new("interpModel",
            dates.class=dates_class,
            dates.in=dates_in,
            y.in=y_in,
            interp.function=interp.function),   
    pred.format=interp.format,
    metadata=metadata,
    data=data,
    fitting.function=if("fitting.function" %in% store) fitting_function else NULL,
    retrans.function=retrans.function)
  
  # Compute and add in uncertainty if appropriate
  if("uncertainty" %in% store) {
    load.model@MSE <- estimateMSE(load.model, n.out=1, n.iter=length(dates_in), replace=FALSE) #LOOCV=jackknife
  }
  
  load.model
}

#' Make flux or concentration predictions from a loadInterp model.
#' 
#' Makes instantaneous predictions (at the temporal resolution of 
#' \code{newdata}) from a fitted \code{\link{loadInterp}} model. See 
#' \code{\link{predictSolute}} for details.
#' 
#' loadInterps are currently assumed to have normally distributed residuals. An 
#' unwitting user might violate this assumption without being caught by the 
#' code, so be careful! This assumption is mainly relevant to the calculation of
#' confidence or prediction intervals. Also, where other models such as loadReg 
#' and loadLm will retransform predictions back into linear space, loadInterps 
#' will not.
#' 
#' @importFrom stats qnorm
#' @inheritParams predictSolute
#' @param load.model A loadInterp object.
#' @param interval character. The type of interval desired. Confidence intervals
#'   are not currently available for loadInterp models.
#' @param se.fit logical, but should be FALSE because se.fit is not currently
#'   available for loadInterp models.
#' @param newdata \code{data.frame}, optional. Predictor data. Because 
#'   loadInterp models interpolate in time among the observations to which they 
#'   have been "fitted", \code{newdata} is usually simply a one-column 
#'   data.frame of dates or date-times. Column names should match those given in
#'   the \code{loadInterp} metadata. If \code{newdata} is not supplied, the 
#'   original fitting data will be used.
#' @return A vector of data.frame of predictions, as for the generic 
#'   \code{\link{predictSolute}}.
#' @export
#' @family predictSolute
predictSolute.loadInterp <- function(
  load.model, flux.or.conc, newdata, interval=c("none","confidence","prediction"), 
  level=0.95, lin.or.log=c("linear","log"), se.fit=FALSE, se.pred=FALSE, date=FALSE, 
  attach.units=FALSE, agg.by=c("unit", "day", "month", "water year", "calendar year", 
                               "total", "mean water year", "mean calendar year", 
                               "[custom]"), ...) {
  
  # Validate arguments
  flux.or.conc <- match.arg.loadflex(flux.or.conc)
  interval <- match.arg.loadflex(interval)
  lin.or.log <- match.arg.loadflex(lin.or.log)
  agg.by <- match.arg(agg.by)
  
  meta <- load.model@metadata
  fit <- load.model@fit
  
  # Use fitting data if newdata are not supplied
  if(missing(newdata)) {
    newdata <- getFittingData(load.model)
  }
  
  # Check that dates are present and in the right format
  dates.out <- getCol(metadata=meta, data=newdata, field="date")
  if(!(all(fit@dates.class == class(dates.out)))) {
    stop(paste0("dates in newdata must have the same class (", paste0(fit@dates.class, collapse=","), ") as dates in the fitting data"))
  }
  # Convert to POSIXct and then to numeric, yielding seconds since 1970 in
  # whatever time zone[s] result from applying the same conversions to both
  # dates.in and dates.out (dates.in processed during loadInterp() call).
  dates.out.numeric <- as.numeric(as.POSIXct(dates.out))
  
  # Do the interpolation
  preds_lin <- fit@interp.function(fit@dates.in, fit@y.in, dates.out.numeric)
  
  # Change flux/conc formats if appropriate
  preds_lin <- formatPreds(
    preds_lin, from.format=load.model@pred.format, to.format=flux.or.conc, 
    newdata=newdata, metadata=load.model@metadata, attach.units=attach.units)
  
  # Add uncertainty if requested. As noted in the documentation above, we're
  # assuming that prediction errors are normally distributed and that no
  # retransformation is necessary.
  if(interval != "none" | se.fit | se.pred) {
    # If there's any sort of uncertainty reporting, we'll need to return a
    # data.frame rather than a vector.
    preds_lin <- data.frame(fit=preds_lin)
    
    # The MSEs we're calculating are specific to the format (flux or conc), so
    # we'll look those up here rather than trying to convert from some
    # format-agnostic MSE into the desired format (that's impossible, turns
    # out). Check that the format we want is available; throw a warning if it's
    # not.
    if(interval == "confidence") {
      stop("confidence intervals not implemented for loadInterp")
    } 
    if(se.fit) {
      stop("se.fit not implemented for loadInterp")
    }
    if(isTRUE(all.equal(dim(load.model@MSE), c(0,0)))) {
      stop("Uncertainty estimates are unavailable. Try fitting the model with store=c('uncertainty').")
    }
    if(is.na(load.model@MSE["mean", flux.or.conc])) {
      stop("Uncertainty estimates are unavailable for ",flux.or.conc,". Try fitting the model with data that include discharge.")
    }
    
    # Now add uncertainty info
    se_lin <- sqrt(load.model@MSE["mean", flux.or.conc])
    preds_log <- linToLog(meanlin=preds_lin$fit, sdlin=se_lin) # we only need preds_log if lin.or.log='log'
    # confidence intervals
    if(interval == "prediction") {
      # degrees of freedom are not straightforward for interpolation models, so
      # use qnorm instead of qt to get quantiles.
      ci_quantiles <- qnorm(p=0.5+c(-1,1)*level/2)
      preds_lin$lwr <- preds_lin$fit + ci_quantiles[1]*se_lin 
      preds_lin$upr <- preds_lin$fit + ci_quantiles[2]*se_lin
      preds_log$lwr <- log(preds_lin$lwr) 
      preds_log$upr <- log(preds_lin$upr)
    }
    # The SEs:
    if(se.pred) {
      preds_lin$se.pred <- se_lin
      preds_log$se.pred <- preds_log$sdlog
    }
  }
  
  # Add dates if requested
  if(date) {
    if(!is.data.frame(preds_lin)) {
      preds_lin <- data.frame(fit=preds_lin)
    }
    # prepend the date column
    preds_lin <- data.frame(date=getCol(load.model@metadata, newdata, "date"), preds_lin)
  }
  
  preds <- preds_lin
  if(lin.or.log == "log") {
    if(is.data.frame(preds)) {
      preds$fit <- log(preds$fit) # this is NOT the mean in log space, but it's the only way to get residuals of 0 in log space
      preds$fit.meanlog <- preds_log$meanlog # include the mean in log space for a tiny bit more clarity
      preds$se.fit <- if(se.fit) NA else NULL
      preds$se.pred <- if(se.pred) preds_log$se.pred else NULL
      preds$lwr <- if(interval == "prediction") log(preds$lwr) else NULL
      preds$upr <- if(interval == "prediction") log(preds$upr) else NULL
    }
  }
  
  #use aggregate solute to aggregate to agg.by, but warn and return NA for uncertainty
  if(agg.by != "unit") {
    preds <- aggregateSolute(preds, metadata = getMetadata(load.model), agg.by = agg.by,
                             format = flux.or.conc, dates = dates.out)
  }
  
  return(preds)
}

# Of the required loadModelInterface functions, getMetadata, getFittingData, and
# getFittingFunction are inherited; the default methods work fine for 
# loadInterp.

#' Estimate uncertainty in an interpolation using leave-one-out cross 
#' validation.
#' 
#' Calculates and returns the mean squared errors (MSEs) in the units and 
#' transformation space (e.g., log space) of the left-hand side of the model fit
#' equation.
#' 
#' This method is leave-one-out cross validation (LOOCV) when n.out==1, 
#' n.iter==nrow(getFittingData(load.model)), and with.replacement=FALSE. This 
#' method is k-fold cross validation when 
#' n.out*n.iter==nrow(getFittingData(load.model)) and with.replacement==FALSE.
#' 
#' @importFrom stats sd
#' @inheritParams estimateMSE
#' @param n.out numeric. The number of observations in the fitting data to leave
#'   out in each iteration.
#' @param n.iter numeric. The number of iterations to perform.
#' @param replace logical. In each iteration, should the n.out observations that
#'   are left out be sampled with replacement of any previous sets of n.out
#'   observations (TRUE) or without replacement (FALSE)?
#' @export
#' @family estimateMSE
estimateMSE.loadInterp <- function(load.model, n.out, n.iter=floor(nrow(getFittingData(load.model))/n.out), replace, ...) {
  
  # Validate args
  replace <- match.arg.loadflex(replace, c(TRUE, FALSE))
  
  # Pull some model pieces from the model for quick access
  interpfun <- load.model@fit@interp.function # the innermost interpolation function, e.g., linearInterpolation
  dates.numeric <- load.model@fit@dates.in # dates.in == dates.out for MSE estimation; already converted in loadInterp() to numeric(POSIXct()).
  y.obs <- load.model@fit@y.in
  
  # Construct a matrix of rows IDs to leave out. The leave.out matrix has one
  # row per iteration, one column per value to leave out in each iteration. 
  # sample.int does the error checking to make sure we don't try to sample more
  # rows than there are when replace=FALSE
  if(!replace) {
    leave.out <- matrix(sample.int(n=length(dates.numeric), size=n.out*n.iter, replace=FALSE), ncol=n.out)
  } else {
    # if replace=TRUE, we still don't actually want to replace within
    # iterations. just allow replacement from one iteration to the next.
    leave.out <- do.call(rbind, replicate(n=n.iter, sample.int(n=length(dates.numeric), size=n.out, replace=FALSE), simplify=FALSE))
  }
  
  # Pre-calculate the conversion factors to make preds and obs in "conc" or "flux" format. Calls to formatPreds are expensive, so do them infrequently.
  is_flux_format <- load.model@pred.format == "flux"
  if(is_flux_format) {
    conc_factor <- tryCatch(
      formatPreds(rep(1, nrow(load.model@data)), from.format=load.model@pred.format, 
                  to.format="conc", newdata=load.model@data, metadata=load.model@metadata, attach.units=FALSE),
      error=function(e) rep(NA, nrow(load.model@data)))
  } else {
    flux_factor <- tryCatch(
      formatPreds(rep(1, nrow(load.model@data)), from.format=load.model@pred.format, 
                  to.format="flux", newdata=load.model@data, metadata=load.model@metadata, attach.units=FALSE),
      error=function(e) rep(NA, nrow(load.model@data)))
  }
  
  # Iterate through the rows of leave.out, each time leaving out some rows,
  # predicting values for those rows, and measuring MSE as the mean squared
  # difference between the predicted and known values for those rows
  residuals_MSE <- matrix(
    ncol=2, byrow=TRUE, dimnames=list(NULL, c("conc","flux")), 
    data=sapply(1:n.iter, function(i) {
      
      # Identify the rows to leave out
      left.out <- leave.out[i,]
      
      # Do the interpolation
      preds <- interpfun(dates.numeric[-left.out], y.obs[-left.out], dates.numeric[left.out])
      
      # Calculate the load rate described by the test observation
      obs <- y.obs[left.out]
      
      # Make conversions to conc or flux as needed. Not sure the if() statement
      # makes this faster; could run some tests.
      conc_resids <- (obs - preds) * if(is_flux_format) conc_factor[left.out] else 1
      flux_resids <- (obs - preds) * if(is_flux_format) 1 else flux_factor[left.out]
      
      # Calculate & return the MSEs for this iteration, conc followed by flux
      c(mean(conc_resids^2), mean(flux_resids^2))
      
    }))
  
  # Return the distribution of the MSE from all those leave-n-out interations in
  # a 2x2 matrix
  rbind(mean=apply(residuals_MSE, 2, mean), sd=apply(residuals_MSE, 2, sd))
  
}

#' Extract model summary statistics from a loadInterp model
#' 
#' Produce a 1-row data.frame of model metrics. The relevant metrics for 
#' loadInterp models include RMSE, p-values, and others TBD.
#' 
#' @md
#' @inheritParams summarizeModel
#' @param irregular.timesteps.ok logical. If FALSE, this function requires that 
#'   the timesteps between observations are identical to one another, and a plot
#'   is generated and an error is thrown if this requirement is not met. If 
#'   TRUE, the check is not performed. If NA (the default), the check is 
#'   performed but the function proceeds with a warning and no plot if the 
#'   timesteps are found to be irregular. Tests and estimates of autocorrelation
#'   are weak to wrong when timesteps are irregular, but timesteps are often at 
#'   least a bit irregular in the real world.
#' @return Returns a 1-row data frame with the following columns:
#'   
#'   * `site.id` - the unique identifier of the site, as in [metadata()]
#'   
#'   * `constituent` - the unique identifier of the constituent, as in 
#'   [metadata()]
#'   
#'   * `RMSE.lin`- the square root of the mean squared error
#'   
#'   * `durbin.watson` - the Durbin Watson test statistic as applied to the 
#'   observations used to fit the interpolation model
#'   
#'   * `rho`, `acf1`, `acf1demean`, `corlag` - measures of the autocorrelation
#'   of the observations used to fit the model
#' @importFrom dplyr select everything
#' @importFrom car durbinWatsonTest
#' @importFrom stats arima
#' @importFrom stats cor
#' @export
#' @family summarizeModel
summarizeModel.loadInterp <- function(
  load.model, irregular.timesteps.ok=NA, ...) {
  
  # prepare args we'll use a few times below
  int.data <- getFittingData(load.model)
  int.dates <- getCol(load.model@metadata, int.data, 'date')
  int.obs <- observeSolute(data=int.data, flux.or.conc=load.model@pred.format, metadata=load.model@metadata)
  
  # Assess the regularity of the time series as in residDurbinWatson and 
  # estimateRho. this code chunk could probably be consolidated into
  # isTimestepRegular someday
  timestep.tol <- .Machine$double.eps^0.5
  if(is.na(irregular.timesteps.ok)) {
    is_regular <- isTimestepRegular(int.dates, tol=timestep.tol, hist=FALSE, handler=warning)
  } else if(!irregular.timesteps.ok) {
    is_regular <- isTimestepRegular(int.dates, tol=timestep.tol, hist=TRUE, handler=function(e) { invisible() })
    if(!is_regular) {
      stop("Tests and estimates of autocorrelation are invalid for an irregular time series. Set irregular.timesteps.ok=TRUE to continue anyway.")
    }
  }
  
  # create a data.frame of model metrics
  out <- NextMethod(load.model, ...) # site.id, constituent, etc.
  out$RMSE.lin <- sqrt(load.model@MSE["mean", load.model@pred.format])
  # as in residDurbinWatson, Use the car package to test for autocorrelation of 
  # the residuals. Because load.model is not always a linear model, we'll simply
  # pass in the residuals and will accept the lack of p-values in the output
  out$durbin.watson <- car::durbinWatsonTest(model=int.obs)
  # as in estimateRho, extract the first-order autocorrelation coefficient, rho,
  # from an AR1 arima model. i think this is supported here:
  # http://stats.stackexchange.com/questions/68243/ar1-coefficient-is-correlation
  out$rho <- coef(arima(int.obs, order=c(1, 0, 0), include.mean=FALSE))[['ar1']]
  # could also include the lag-1 ACF value
  out$acf1 <- acf(int.obs, plot=FALSE, lag.max=1, demean=FALSE)$acf[2]
  out$acf1demean <- acf(int.obs, plot=FALSE, lag.max=1, demean=TRUE)$acf[2]
  out$corlag1 <- cor(int.obs[-1], int.obs[-length(int.obs)], method='pearson')
  
  # return
  return(out)
}
McDowellLab/loadflex documentation built on May 8, 2019, 9:48 a.m.