# R/loadComp.R

#### compModel class ####

#' The engine of a loadComp model.
#'
#' This model class is designed to nest within loadComp objects. A compModel is
#' a container for two inner models, each implementing the loadModelInterface:
#' reg.model and resid.model. reg.model will usually be a regression model,
#' often of type loadReg. resid.model is typically a loadInterp, used to
#' interpolate among the residuals of the reg.model predictions.
#'
#' @include loadModel.R
#' @rdname compModel-class
#' @name compModel-class
#' @slot reg.model The regression model - any model that implements the
#'   loadModelInterface.
#' @slot resid.model The interpolation model - any model that implements the
#'   loadModelInterface.
#' @slot log.resids Should residuals be computed in log space (TRUE) or
#'   linear space (FALSE)?
#' @slot abs.or.rel.resids character specifying whether interpolation should be
#'   by absolute residuals (obs-pred) or relative residuals (obs/pred).
#' @importFrom methods setClass
#' @exportClass compModel
#' @family load.model.fits
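#' @examples
#' \dontrun{
#' # Hypothetical sketch (placeholder names): `reg` is any fitted model that
#' # implements the loadModelInterface, and `interp` is a loadInterp fitted to
#' # the residuals of `reg`. In practice, call loadComp() rather than building
#' # a compModel with new() directly.
#' comp_fit <- new("compModel", reg.model=reg, resid.model=interp,
#'                 log.resids=TRUE, abs.or.rel.resids="absolute")
#' }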
setClass(
  "compModel",

  slots=c(
    reg.model="ANY",
    resid.model="ANY",
    log.resids="logical",
    abs.or.rel.resids="character"),

  prototype=c(
    reg.model=NULL,
    resid.model=NULL,
    log.resids=NA,
    abs.or.rel.resids=NA),

  validity=function(object) {
    errorstrs <- character()

    if(length(errorstrs) > 0) {
      errorstrs
    } else {
      TRUE
    }
  }
)

#### loadComp class ####

#' A load model class implementing the composite method for flux estimation.
#'
#' @importFrom methods setClass
#' @exportClass loadComp
#' @family load.model.classes
setClass(
  "loadComp",

  slots=c(
    fit="compModel",
    MSE="matrix"),

  contains="loadModel",

  # Validity of parent objects is checked first, automatically
  validity=function(object) {
    errorstrs <- character()

    # Return
    if(length(errorstrs) == 0) {
      TRUE
    } else {
      errorstrs
    }
  }
)


#' Create a fitted loadComp object.
#'
#' Generates a new model of class loadComp (\code{\link{loadComp-class}}).
#' loadComps themselves contain two inner load models, one for regression and
#' one for interpolation of the residuals of the regression predictions.
#'
#' @include getResiduals.R
#' @importFrom methods new
#' @param reg.model The model, usually a regression model, to whose predictions
#'   the residual corrections will be applied.
#' @param interp.format character specifying the load format in which residuals
#'   should be interpolated
#' @param interp.data the dataset, possibly differing from
#'   getFittingData(reg.model), from which regression residuals will be
#'   calculated and interpolated.
#' @param interp.function a function accepting args dates.in, y.in, and
#'   dates.out and returning y.out. See \link{interpolations} for pre-defined
#'   options, or write your own having the same form.
#' @param abs.or.rel.resids Should residuals be computed as the difference
#'   ("absolute") or the ratio ("relative") of the observed and predicted
#'   values?
#' @param use.log logical. Should residuals be computed in log space (TRUE) or
#'   linear space (FALSE)?
#' @param MSE.method character. The method by which the model should be
#'   bootstrapped. "non-parametric": resample with replacement from the original
#'   fitting data, refit the model, and make new predictions. "parametric":
#'   resample the model coefficients based on the covariance matrix originally
#'   estimated for those coefficients, then make new predictions.
#' @param store One or more character strings specifying which information to
#'   write within the model. Options are 'data': the original fitting data;
#'   'fitting.function': a fitting function that can produce a new loadComp
#'   object from new data (this currently uses the same new data for both
#'   regression calibration and interpolation); 'uncertainty': an estimate of
#'   uncertainty, which can take some time to compute but will permit creation
#'   of uncertainty intervals, etc. in the prediction and aggregation phases.
#' @inheritParams estimateMSE.loadComp
#' @return A fitted loadComp model.
#' @export
#' @family load.model.inits
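#' @examples
#' \dontrun{
#' # Hypothetical sketch (placeholder names): `no3_reg` is a previously fitted
#' # regression load model (e.g., a loadReg2 object) and `cal_data` is the
#' # calibration data.frame containing date, discharge, and concentration
#' # columns matching the model metadata.
#' no3_comp <- loadComp(
#'   reg.model=no3_reg, interp.format="conc", abs.or.rel.resids="absolute",
#'   use.log=TRUE, interp.data=cal_data, interp.function=linearInterpolation,
#'   store=c("data","fitting.function","uncertainty"))
#' }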
loadComp <- function(reg.model,
                     interp.format=c("flux","conc"), abs.or.rel.resids=c("absolute","relative"), use.log=TRUE,
                     interp.data, interp.function=linearInterpolation, store=c("data","fitting.function"),
                     n.iter=100, MSE.method="parametric") {

  # Validate arguments
  interp.format <- match.arg.loadflex(interp.format)
  abs.or.rel.resids <- match.arg.loadflex(abs.or.rel.resids)
  store <- match.arg.loadflex(store, c("data","fitting.function","uncertainty"), several.ok=TRUE)

  # Determine new metadata from several possible sources of info
  metadata <- getMetadata(reg.model) # start with metadata attached to reg.model

  # CM step 1 - get residuals
  resids <- getResiduals(load.model=reg.model, flux.or.conc=interp.format, abs.or.rel.resids=abs.or.rel.resids, use.log=use.log, newdata=interp.data)$Resid

  # CM step 2 - train interpolation model on residuals
  resid_model <- loadInterp(
    interp.format=interp.format, interp.function=interp.function,
    data=data.frame(interp.data, Resid=resids),
    metadata=if(interp.format=="flux") {
      updateMetadata(metadata=metadata, load.rate="Resid")
    } else {
      updateMetadata(metadata=metadata, constituent="Resid")
    },
    retrans.function=NULL,
    store=intersect(store, c("data","fitting.function")))

  # If requested, generate fitting function
  if("fitting.function" %in% store) {
    fitting_function <- function(training.data, store=c()) {
      # create & return new loadComp object based on a refitted regression model
      loadComp(reg.model=getFittingFunction(reg.model)(training.data, store=store),
               interp.format=interp.format, abs.or.rel.resids=abs.or.rel.resids,
               interp.data=training.data, interp.function=interp.function, store=store)
    }
  }

  # create new loadComp object
  load.model <- new(
    "loadComp",
    fit=new("compModel", reg.model=reg.model, resid.model=resid_model, log.resids=use.log, abs.or.rel.resids=abs.or.rel.resids),
    pred.format=interp.format,
    metadata=metadata,
    data=if("data" %in% store) interp.data else NULL,
    fitting.function=if("fitting.function" %in% store) fitting_function else NULL)

  # If requested, compute & store uncertainty from the loadComp object
  if("uncertainty" %in% store) {
    load.model@MSE <- estimateMSE(load.model, n.iter=n.iter, method=MSE.method)
  }

  load.model
}


#' Make flux or concentration predictions from a loadComp model.
#'
#' Makes instantaneous predictions (at the temporal resolution of
#' \code{newdata}) from a fitted \code{\link{loadComp}} model. See
#' \code{\link{predictSolute}} for details.
#'
#' @importFrom stats qnorm
#' @import dplyr
#' @inheritParams predictSolute
#' @param load.model A loadComp object.
#' @param newdata \code{data.frame}, optional. Predictor data, including any
#'   predictor columns needed for the regression model and any needed for the
#'   interpolation model. Column names should match those given in the
#'   \code{loadComp} metadata. If \code{newdata} is not supplied, the original
#'   fitting data will be used.
#' @param fit.reg logical. If TRUE, a column for the regression predictions
#'   before interpolation will be included in the data.frame that is returned.
#'   These will be in the same format (non-log, conc/flux) as the final
#'   predictions.
#' @param fit.resid logical. If TRUE, a column for the residual corrections will
#'   be included in the data.frame that is returned. These will be in the same
#'   format (non-log, conc/flux) as the final predictions, even when the
#'   residuals were actually produced in log space and/or as relative residuals,
#'   so that fit = fit.reg + fit.resid.
#' @param fit.resid.raw logical. If TRUE, a column for the residual corrections
#'   as returned from the interpolation model will be included in the data.frame
#'   that is returned. These may be in log space and/or unitless, depending on
#'   the type of residual correction specified when the loadComp model was
#'   created.
#' @return A vector or data.frame of predictions, as for the generic
#'   \code{\link{predictSolute}}.
#' @export
#' @family predictSolute
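#' @examples
#' \dontrun{
#' # Hypothetical sketch (placeholder names): `no3_comp` is a loadComp model
#' # fitted with 'uncertainty' in `store`, and `est_data` is a data.frame of
#' # prediction-time dates and discharges.
#' preds <- predictSolute(
#'   no3_comp, flux.or.conc="flux", newdata=est_data,
#'   se.pred=TRUE, interval="prediction", fit.reg=TRUE, fit.resid=TRUE)
#' head(preds)
#' }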
predictSolute.loadComp <- function(
  load.model, flux.or.conc=c("flux","conc"), newdata=getFittingData(load.model),
  date=TRUE, count=!identical(agg.by,"unit"), se.fit=FALSE, se.pred=FALSE,
  interval=c("none","confidence","prediction"), level=0.95, lin.or.log=c("linear","log"),
  agg.by=c("unit", "day", "month", "water year", "calendar year", "total", "[custom]"),
  na.rm=FALSE, attach.units=FALSE,
  fit.reg=FALSE, fit.resid=FALSE, fit.resid.raw=FALSE, ...) {

  # Validate arguments
  flux.or.conc <- match.arg.loadflex(flux.or.conc)
  interval <- match.arg.loadflex(interval)
  attach.units <- match.arg.loadflex(attach.units)
  lin.or.log <- match.arg.loadflex(lin.or.log)
  if(missing(agg.by)) agg.by <- "unit"
  if(missing(count)) count <- !(identical(agg.by, "unit"))
  match.arg.loadflex(fit.reg, c(TRUE, FALSE))
  match.arg.loadflex(fit.resid, c(TRUE, FALSE))
  match.arg.loadflex(fit.resid.raw, c(TRUE, FALSE))

  metadata <- getMetadata(load.model)

  # Make the prediction, always in the original (as-interpolated) format
  if(missing(newdata)) {
    newdata <- getFittingData(load.model@fit@reg.model)
  }
  reg_preds <- predictSolute(
    load.model=load.model@fit@reg.model, flux.or.conc=load.model@pred.format,
    newdata=newdata, attach.units=attach.units,
    date=FALSE, count=FALSE, se.fit=FALSE, se.pred=FALSE, ...)
  resid_preds <- predictSolute(
    load.model=load.model@fit@resid.model, flux.or.conc=load.model@pred.format,
    newdata=newdata, attach.units=attach.units,
    date=FALSE, count=FALSE, se.fit=FALSE, se.pred=FALSE, ...)

  # Do the composite correction: combine reg_preds with resid_preds.
  # resid_preds may be logged or not, absolute or relative, while reg_preds is
  # always in pred.format, non-log space. So put reg_preds into the residuals'
  # space, combine, and convert back to linear space below. 4 possibilities:
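  #   log.resids=TRUE,  absolute: log(pred) = log(reg_pred) + resid
  #   log.resids=TRUE,  relative: log(pred) = log(reg_pred) * resid
  #   log.resids=FALSE, absolute:     pred  =     reg_pred  + resid
  #   log.resids=FALSE, relative:     pred  =     reg_pred  * resid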
  preds <- switch(
    load.model@fit@abs.or.rel.resids,
    "absolute"=(if(load.model@fit@log.resids) log(reg_preds) else reg_preds) + resid_preds,
    "relative"=(if(load.model@fit@log.resids) log(reg_preds) else reg_preds) * resid_preds)

  # If we aren't already in linear space, get there. No bias correction at this
  # stage because (1) It's not clear the interpolation phase is biased (already
  # corrected in regression prediction phase), and (2) if we did bias correction
  # here, the observations and predictions wouldn't match even for
  # linearInterpolation and others that are supposed to run straight through
  # each residual.
  predvec_lin <- if(load.model@fit@log.resids) exp(preds) else preds

  # Change flux/conc formats if appropriate. MSEs are already formatted.
  predvec_lin <- formatPreds(
    predvec_lin, from.format=load.model@pred.format, to.format=flux.or.conc,
    newdata=newdata, metadata=load.model@metadata, attach.units=attach.units)

  # Add uncertainty if requested
  if(interval != "none" | se.fit | se.pred | lin.or.log == 'log') {
    # Do early checks to send useful messages if we can't supply the requested
    # type of uncertainty
    if(interval == "confidence") {
      stop("Confidence intervals not implemented for loadComp models")
    }
    if(se.fit) {
      stop("se.fit not available for loadComp models")
    }
    if(isTRUE(all.equal(dim(load.model@MSE), c(0,0)))) {
      if(se.pred) {
        stop("Uncertainty estimates were not stored with the model. ",
             "Refit with 'uncertainty' in 'store' argument to enable se.pred=TRUE")
      }
      warning("Uncertainty estimates were not stored with the model, which produces NAs when lin.or.log='log'. ",
              "Refit with 'uncertainty' in 'store' argument to enable predictions")
      load.model@MSE <- matrix(NA, nrow=2, ncol=2, dimnames=list(c('mean','sd'), c('conc','flux')))
    } else if(is.na(load.model@MSE["mean", flux.or.conc])) {
      stop("Uncertainty estimates are unavailable for ",flux.or.conc,". Try fitting the model with data that include discharge.")
    }

    # Compute uncertainty using a lognormal distribution if log.resids==TRUE,
    # normal elsewise. Although I expect a lognormal distribution will make
    # sense most often, the estimateMSE routine often (rightly) gives NaN values
    # for meanlog when log.resids==FALSE, because it is possible in that case
    # for predictions to be negative (and therefore for their log to be NaN).
    # This all points in favor of preferring log.resids==TRUE, which I was
    # leaning toward anyway because in general load estimates ARE lognormal,
    # more or less.
    if(load.model@fit@log.resids) {
      # Compute uncertainty assuming a lognormal distribution of error around
      # each prediction. The MSE comes in log space when
      # load.model@fit@log.resids==TRUE. Since flux and conc are identical here,
      # it doesn't matter which column we use - just use the first.
      se_log <- sqrt(load.model@MSE["mean", 1])
      # The mean: always use se.pred (rather than se.fit) to calculate the mean in
      # log space.
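      # (For reference: for a lognormal variable with linear-space mean m and
      # sdlog s, meanlog = log(m) - s^2/2.)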
      preds_log <- mixedToLog(meanlin=predvec_lin, sdlog=se_log)
      preds_lin <- logToLin(mslist=preds_log)

      # The intervals:
      if(interval == "prediction") {
        # degrees of freedom are not straightforward for the interpolation part of
        # the model, so use qnorm instead of qt to get quantiles.
        ci_quantiles <- qnorm(p=0.5+c(-1,1)*level/2)
        preds_log$lwr.pred <- preds_log$meanlog + ci_quantiles[1]*se_log
        preds_log$upr.pred <- preds_log$meanlog + ci_quantiles[2]*se_log
        preds_lin$lwr.pred <- exp(preds_log$lwr.pred)
        preds_lin$upr.pred <- exp(preds_log$upr.pred)
      }
      # The SEs:
      if(se.pred) {
        preds_log$se.pred <- se_log
        preds_lin$se.pred <- preds_lin$sdlin
      }
    } else {
      # Compute uncertainty assuming a normal distribution of error around each
      # prediction
      se_lin <- sqrt(load.model@MSE["mean", flux.or.conc])
      preds_lin <- data.frame(meanlin=predvec_lin, sdlin=se_lin)
      preds_log <- linToLog(mslist=preds_lin) # we only need preds_log if lin.or.log='log'

      # The intervals:
      if(interval == "prediction") {
        # degrees of freedom are not straightforward for the interpolation part of
        # the model, so use qnorm instead of qt to get quantiles.
        ci_quantiles <- qnorm(p=0.5+c(-1,1)*level/2)
        preds_lin$lwr.pred <- preds_lin$meanlin + ci_quantiles[1]*se_lin
        preds_lin$upr.pred <- preds_lin$meanlin + ci_quantiles[2]*se_lin
        preds_log$lwr.pred <- log(preds_lin$lwr.pred)
        preds_log$upr.pred <- log(preds_lin$upr.pred)
      }
      # The SEs:
      if(se.pred) {
        preds_lin$se.pred <- preds_lin$sdlin
        preds_log$se.pred <- preds_log$sdlog
      }
    }

    # Format the output
    preds_lin$sdlin <- NULL # we've copied this to the se.pred column if we wanted it
    names(preds_lin)[1] <- "fit" # name consistently with other predictSolute outputs
  } else {
    # If we're not returning any uncertainty info, format as a vector rather than as a data.frame
    preds_lin <- predvec_lin
  }

  # Add dates if requested
  if(date) {
    if(!is.data.frame(preds_lin)) {
      preds_lin <- data.frame(fit=preds_lin)
    }
    # prepend the date column
    preds_lin <- data.frame(date=getCol(load.model@metadata, newdata, "date"), preds_lin)
  }

  # Add intermediate predictions (regression, residuals in linear and/or
  # original form) if requested
  if(fit.reg | fit.resid | fit.resid.raw) {
    if(!is.data.frame(preds_lin)) {
      preds_lin <- data.frame(fit=preds_lin)
    }
    fit_reg <- if(fit.reg || fit.resid) {
      formatPreds(
        reg_preds, from.format=load.model@pred.format, to.format=flux.or.conc,
        newdata=newdata, metadata=load.model@metadata, attach.units=attach.units)
    }
    fit_resid <- if(fit.resid) {
      preds_lin$fit - fit_reg
    }
    fit_resid_raw <- if(fit.resid.raw) {
      resid_preds
    }
    preds_lin <- data.frame(
      preds_lin,
      c(if(fit.reg) list(fit.reg=fit_reg),
        if(fit.resid) list(fit.resid=fit_resid),
        if(fit.resid.raw) list(fit.resid.raw=fit_resid_raw)))
  }

  preds <- preds_lin
  if(lin.or.log == "log") {
    if(is.data.frame(preds)) {
      preds$fit <- log(preds$fit) # this is NOT the mean in log space, but it's the only way to get residuals of 0 in log space
      preds$fit.meanlog <- preds_log$meanlog # include the mean in log space for a tiny bit more clarity
      preds$se.fit <- if(se.fit) NA else NULL
      preds$se.pred <- if(se.pred) preds_log$se.pred else NULL
      preds$lwr.pred <- if(interval == "prediction") log(preds$lwr.pred) else NULL
      preds$upr.pred <- if(interval == "prediction") log(preds$upr.pred) else NULL
    }
  }

  # use aggregateSolute() to aggregate to agg.by, but warn and return NAs for uncertainty if it was requested
  if(agg.by != "unit") {
    preds <- aggregateSolute(preds, metadata = getMetadata(load.model), agg.by = agg.by,
                             format = flux.or.conc, dates = getCol(load.model@metadata, newdata, "date"))
    if(interval != "none" || se.fit || se.pred) {
      warning("Uncertainty for aggregated predictions is unavailable for loadComp models; returning NAs")
    } else {
      extra_cols <- intersect(names(preds), c("SE", "CI_lower", "CI_upper"))
      preds <- preds %>% dplyr::select(-!!extra_cols)
    }
  }

  # Rename the `fit` column to describe the type of prediction
  if(is.data.frame(preds)) {
    names(preds) <- replace(names(preds), names(preds)=='fit', flux.or.conc)
  }

  # Return
  return(preds)
}



#' Estimate uncertainty in a composite model.
#'
#' Calculates and returns the mean squared errors (MSEs) in the units and
#' transformation space (e.g., log space) of the left-hand side of the model fit
#' equation.
#'
#' This method uses leave-one-out cross validation (LOOCV) for the interpolation
#' component and repeated resampling of the regression coefficients from which
#' the regression predictions and residuals are calculated.
#'
#' @importFrom stats sd
#' @inheritParams estimateMSE
#' @param n.iter The number of times to repeat the complete process of
#'   simulating predictions from the regression model and running leave-one-out
#'   cross validation over all of the interpolation data points. Each run
#'   through the process generates one estimate of the MSE, from which a mean
#'   and SD of the MSE estimates are returned.
#' @param method  character. The method by which the model should be
#'   bootstrapped. "non-parametric": resample with replacement from the original
#'   fitting data, refit the model, and make new predictions. "parametric":
#'   resample the model coefficients based on the covariance matrix originally
#'   estimated for those coefficients, then make new predictions.
#' @param rho The first-order autocorrelation coefficient to assume in
#'   simulateSolute(regression.model, interpolation.data). If missing, rho will
#'   be estimated from the interpolation data, but be warned that many data
#'   points are needed to make this estimation with precision.
#' @export
#' @family estimateMSE
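#' @examples
#' \dontrun{
#' # Hypothetical sketch (placeholder name): `no3_comp` is a fitted loadComp
#' # model whose interpolation data were stored at fitting time. Each iteration
#' # simulates new regression predictions, then runs leave-one-out cross
#' # validation over the interpolation points.
#' mses <- estimateMSE(no3_comp, n.iter=50, method="parametric")
#' mses # 2x2 matrix: mean and sd of the MSE estimates for conc and flux
#' }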
estimateMSE.loadComp <- function(load.model, n.iter=100, method="parametric", rho, ...) {

  # Pull out the resid.model for easier access
  resid.model <- load.model@fit@resid.model

  # Use the interpolation data for simulation, observation, and calculation of new residuals
  interp_data <- resid.model@data

  # Observe values from the same dataset
  interp_obs <- observeSolute(
    data=interp_data, # use the interpolation data, this time as a source of observations
    flux.or.conc=resid.model@pred.format, # use the interpolation model's interp.format format
    metadata=load.model@fit@reg.model@metadata, # use metadata from the regression model to access columns relevant to that model (i.e., not Resid, but the real flux/conc)
    calculate=NA, # only calculate if we need to
    attach.units=FALSE) # save time by skipping the units

  # Put the observations into log space if necessary for calculating residuals
  # and final errors
  if(load.model@fit@log.resids) {
    interp_obs_resids <- log(interp_obs)
  } else {
    interp_obs_resids <- interp_obs
  }

  # Pre-calculate the conversion factors to make preds and obs in "conc" or "flux" format. Calls to formatPreds are expensive, so do them infrequently.
  is_flux_format <- resid.model@pred.format == "flux"
  if(is_flux_format) {
    conc_factor <- formatPreds(rep(1, nrow(resid.model@data)), from.format=resid.model@pred.format,
                               to.format="conc", newdata=resid.model@data, metadata=resid.model@metadata, attach.units=FALSE)
  } else {
    flux_factor <- formatPreds(rep(1, nrow(resid.model@data)), from.format=resid.model@pred.format,
                               to.format="flux", newdata=resid.model@data, metadata=resid.model@metadata, attach.units=FALSE)
  }

  # Loop, each time resampling coefficients and calculating the leave-n-out
  # (probably LOO) uncertainty for the interpolation.
  complete_MSE <- do.call(
    "rbind", #dimnames=list(NULL, c("conc","flux")),
    lapply(1:n.iter, function(i) {

      # Simulate values from the regression, putting them into interpolation format (flux or conc) and linear space
      sim_preds <- simulateSolute(
        load.model@fit@reg.model, flux.or.conc=resid.model@pred.format,
        newdata=interp_data, method=method, from.interval="confidence", if(!missing(rho)) rho)

      # Calculate residuals. How we calculate them should depend on the load.model
      # options. This code is almost identical to a chunk in getResiduals but
      # repeated here because I don't see a need to generalize getResiduals enough
      # to handle this composite-method-specific simulation/LOOCV process.
      if(load.model@fit@log.resids) {
        sim_preds <- log(sim_preds)
      }
      sim_resids <- switch(
        load.model@fit@abs.or.rel.resids,
        "absolute" = interp_obs_resids - sim_preds,
        "relative" = interp_obs_resids / sim_preds)

      # Update the interpolation model with the new residuals
      interp_refit <- load.model@fit@resid.model
      # This method of updating interp_refit, going straight to the guts, is a
      # little risky. It rests on the assumption that loadComp models always will
      # pass loadInterps the data in the interpolation format, such that
      # loadInterp will never have adjusted the values of y.in during the
      # loadInterp() call. For now, though, that's true. And it saves us the time
      # cost of reconverting dates, creating a new S4 object from scratch, and so
      # on.
      interp_refit@fit@y.in <- sim_resids

      ### Get the MSE of the new interpolation model. Use LOOCV (delete-one
      ### jackknife): leave just one out at a time, iterating through all the
      ### elements of interp_data

      # Pull some model pieces from the model for quick access
      interpfun <- resid.model@fit@interp.function # the innermost interpolation function, e.g., linearInterpolation
      dates.numeric <- resid.model@fit@dates.in # dates.in == dates.out for MSE estimation; already converted in loadInterp() to numeric(POSIXct()).
      y.obs <- resid.model@fit@y.in

      # Leaving one observation out at a time, do the interpolation repeatedly.
      # Go straight to the inner interpfun because we've needed to "refit" the
      # interpolation model with new y.obs values but we'd rather not reconvert
      # the dates all over again
      resid_preds <- sapply(1:length(y.obs), function(left.out) {
        interpfun(dates.numeric[-left.out], y.obs[-left.out], dates.numeric[left.out])
      })

      # resid_preds may be logged or not; sim_preds has already been converted
      # to log space to match. resid_preds may be absolute or relative; the
      # combination function (+ or *) changes accordingly.
      preds <- switch(
        load.model@fit@abs.or.rel.resids,
        "absolute"=sim_preds + resid_preds,
        "relative"=sim_preds * resid_preds)

      # Compute and package the residuals for return as a chunk in the
      # completeMSE sapply loop. Compute the residuals in linear space when
      # log.resids is FALSE - otherwise, we'll sometimes run into problems with
      # negative predictions, whose log is NaN.
      if(!load.model@fit@log.resids) {
        # When !log.resids, preds and interp_obs_resids are already in linear
        # space. So we're all ready to calculate residuals, making conversions
        # to conc or flux as needed
        conc_resids <- (interp_obs_resids - preds) * if(is_flux_format) conc_factor else 1
        flux_resids <- (interp_obs_resids - preds) * if(is_flux_format) 1 else flux_factor
        # Calculate & return the squared errors for this LOOCV run. It's more
        # computationally intensive, but quicker to program, to do as we're
        # doing here: return all the raw values, one column per format, rather
        # than computing an intermediate summary statistic.
        cbind(conc_resids^2, flux_resids^2)

      } else {
        # When log.resids is TRUE, preds and interp_obs_resids are already in
        # log space, so we're ready to calculate residuals. Residuals are the
        # same for flux and conc in log space - astonishing but true!
        # log(A*C)-log(B*C) = log(A)+log(C)-log(B)-log(C) = log(A)-log(B) with
        # no C!
        resids2 <- (interp_obs_resids - preds)^2
        cbind(resids2, resids2)
      }

    }))

  # Do some error checking so users know if their MSE is coming out with NAs
  if(any(is.na(complete_MSE))) {
    warning('getting NAs when calculating composite model error; check for NA or negative values in the input data')
  }

  # Return the distribution of the MSE from all those leave-n-out iterations
  # in a 2x2 matrix. The errors that have been calculated are already in the
  # right log/linear space and flux/conc format, so our only remaining job is
  # to summarize their distribution.
  MSEs <- rbind(mean=apply(complete_MSE, 2, mean), sd=apply(complete_MSE, 2, sd))
  colnames(MSEs) <- c("conc","flux")
  MSEs
}

#' Extract model summary statistics from a loadComp model
#'
#' Produce a 1-row data.frame of model metrics. The relevant metrics for
#' loadComp models include two sets of statistics about autocorrelation (one for
#' the regression residuals, one for the 'residuals' used to do the composite
#' correction).
#'
#' @md
#' @inheritParams summarizeModel
#' @param newdata data.frame of data that was/will be used to predict
#'   concentration or load; should be the same as the \code{newdata} argument to
#'   \code{predictSolute}, e.g. a data.frame of daily or instantaneous dates and
#'   discharges.
#' @param irregular.timesteps.ok logical. If FALSE, this function requires that
#'   the timesteps between observations are identical to one another, and a plot
#'   is generated and an error is thrown if this requirement is not met. If
#'   TRUE, the check is not performed. If NA (the default), the check is
#'   performed but the function proceeds with a warning and no plot if the
#'   timesteps are found to be irregular. Tests and estimates of autocorrelation
#'   are weak to wrong when timesteps are irregular, but timesteps are often at
#'   least a bit irregular in the real world.
#' @return Returns a 1-row data frame with the following columns:
#'
#'   * `site.id` - the unique identifier of the site, as in [metadata()]
#'
#'   * `constituent` - the unique identifier of the constituent, as in
#'   [metadata()]
#'
#'   * `RMSE.lin` or `RMSE.log` - the square root of the mean squared error,
#'   reported in log space (`RMSE.log`) if `use.log` was `TRUE` in the call to
#'   [loadComp()] that created this model, or in linear space (`RMSE.lin`)
#'   otherwise.
#'
#'   * `reg.durbin.watson` - the Durbin-Watson test statistic as applied to the
#'   residuals from the regression model fitting process
#'
#'   * `reg.rho` - the autocorrelation coefficient of the residuals from the
#'   regression model fitting process
#'
#'   * `int.durbin.watson` - the Durbin-Watson test statistic as applied to the
#'   residuals from the interpolation model fitting process. See
#'   [residDurbinWatson()]
#'
#'   * `int.rho` - the autocorrelation coefficient of the residuals from the
#'   interpolation model fitting process. See [estimateRho()]
#'
#'   * `correction.frac` - the correction fraction, i.e., the fraction of total
#'   concentration or flux prediction that is attributable to a correction such
#'   as the residuals correction of composite method models. See
#'   [getCorrectionFraction()]
#' @importFrom dplyr select everything
#' @export
#' @family summarizeModel
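#' @examples
#' \dontrun{
#' # Hypothetical sketch (placeholder names): `no3_comp` is a fitted loadComp
#' # model and `est_data` is the data.frame that was (or will be) passed to
#' # predictSolute.
#' summarizeModel(no3_comp, newdata=est_data, irregular.timesteps.ok=TRUE)
#' }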
summarizeModel.loadComp <- function(
  load.model, newdata, irregular.timesteps.ok=NA, ...) {

  # prepare args we'll use a few times below
  resid.args <- list(
    load.model=getFittedModel(load.model)@reg.model,
    flux.or.conc=load.model@pred.format,
    abs.or.rel.resids=getFittedModel(load.model)@abs.or.rel.resids,
    use.log=getFittedModel(load.model)@log.resids,
    plot.acf=FALSE, irregular.timesteps.ok=irregular.timesteps.ok)
  resid.data <- getFittingData(getFittedModel(load.model)@resid.model)

  # create a data.frame of model metrics
  out <- NextMethod(load.model, ...) # site.id, constituent, etc.
  RMSE <- if(0 %in% dim(load.model@MSE)) {
    NA
  } else {
    sqrt(load.model@MSE["mean", resid.args$flux.or.conc])
  }
  if(resid.args$use.log) out$RMSE.log <- RMSE else out$RMSE.lin <- RMSE
  out$reg.durbin.watson <- do.call(residDurbinWatson, resid.args)
  resid.args$irregular.timesteps.ok <- TRUE # checked on the first call; skip the check now to avoid duplicate warnings
  out$reg.rho <- do.call(estimateRho, resid.args)$rho
  out$int.durbin.watson <- do.call(residDurbinWatson, c(list(newdata=resid.data), resid.args))
  out$int.rho <- do.call(estimateRho, c(list(newdata=resid.data), resid.args))$rho
  out$correction.frac <- getCorrectionFraction(load.model, flux.or.conc=resid.args$flux.or.conc, newdata=newdata)

  # return
  return(out)
}

#' The fraction of prediction that is due to a correction.
#'
#' Computes the fraction of total concentration or flux prediction that is
#' attributable to a correction such as the residuals correction of composite
#' method models.
#'
#' @name getCorrectionFraction
#' @rdname getCorrectionFraction
#' @param load.model The load model from which to pull the correction fraction.
#' @param flux.or.conc character. Should the metric be computed with respect to
#'   predictions of flux rate or concentration?
#' @param newdata The data for which to compute the correction fraction, usually
#'   containing daily or instantaneous discharge.
#' @param ... Other arguments passed to class-specific implementations of
#'   getCorrectionFraction.
#' @return A value between 0 and 1 indicating the fraction of total load that is
#'   attributed to a correction.
#' @export
#'
#' @family diagnostics
getCorrectionFraction <- function(load.model, flux.or.conc=c("flux","conc"), newdata, ...) {
  UseMethod("getCorrectionFraction")
}

#' @description
#'
#' \code{\link{loadComp}} models: The correction fraction for a composite method
#' (\code{loadComp}) model is defined here as:
#'
#' \deqn{CF = (\sum |R| \Delta t) / (\sum L \Delta t)}
#'
#' where \eqn{\sum} indicates a sum over all predictions, \eqn{L} is the vector
#' of composite predictions of flux or concentration, \eqn{R} is the vector of
#' interpolated residuals in the same format (flux or concentration, non-log
#' space) as \eqn{L}, and \eqn{\Delta t} is the vector of time periods
#' represented by the predictions.
#'
#' @rdname getCorrectionFraction
#' @importFrom stats complete.cases
#' @inheritParams getCorrectionFraction
#' @param na.rm logical. Should predictions with NA values be excluded?
#' @export
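#' @examples
#' # Toy illustration of the CF formula above, using made-up numbers rather
#' # than output from a fitted model:
#' L <- c(10, 12, 9, 11)       # composite predictions
#' R <- c(1, -0.5, 0.8, -1.2)  # interpolated residual corrections
#' dt <- rep(1, 4)             # equal time steps
#' sum(abs(R * dt)) / sum(L * dt)
#' \dontrun{
#' # With a fitted model (placeholder names): `no3_comp` is a loadComp model
#' # and `est_data` holds prediction-time dates and discharges.
#' getCorrectionFraction(no3_comp, flux.or.conc="flux", newdata=est_data)
#' }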
getCorrectionFraction.loadComp <- function(load.model, flux.or.conc=c("flux","conc"), newdata, na.rm=FALSE, ...) {
  if(missing(newdata)) stop("required argument to getCorrectionFraction: newdata")
  preds <- predictSolute(load.model, flux.or.conc, newdata, fit.reg=TRUE, fit.resid=TRUE, date=TRUE)
  if(isTRUE(na.rm)) {
    preds <- preds[complete.cases(preds), ]
  }
  R <- preds$fit.resid
  L <- preds$fit
  #dt involves explicit and/or implicit conversions to numeric; fine as long as
  #dates are in a date format with a decent as.numeric implementation (includes
  #Date, POSIXt, chron)
  dt <- as.numeric(c(diff(preds$date), mean(diff(preds$date))))

  sum(abs(R*dt))/sum(L*dt)
}