#### loadLm class ####
#' A load model class specific to simple linear models (\code{\link{lm}}s) for
#' flux estimation.
#'
#' loadLms can take any lm formula.
#'
#' @slot fit the fitted linear model (\code{lm}) used to make predictions.
#' @slot ylog logical. If TRUE, this constitutes affirmation that the values
#' passed to the left-hand side of the model formula will be in log space. If
#' missing, the value of \code{ylog} will be inferred from the values of
#' \code{formula} and \code{y.trans.function}, but be warned that this
#' inference is fallible.
#' @importFrom methods setClass
#' @exportClass loadLm
#' @family load.model.classes
setClass(
"loadLm",
slots=c(
fit="lm",
ylog="logical"),
contains="loadModel",
# Validity of parent objects is checked first, automatically
validity=function(object) {
errorstrs <- character()
validObject(object@fit)
# Return
if(length(errorstrs) == 0) {
TRUE
} else {
errorstrs
}
}
)
#' Create a fitted loadLm object.
#'
#' Generates a new model of class loadLm (\code{\link{loadLm-class}}) which
#' relates observations of concentration or flux to predictor variables using a
#' linear regression model.
#'
#' @importFrom methods new
#' @importFrom stats lm
#' @param formula A formula specifying the linear model to fit.
#' @param pred.format In what format (flux or conc) does the lm object make
#' predictions when predict.lm() is called?
#' @param data data.frame. The data to which the model will be fit
#' @param metadata metadata, used to access the appropriate columns of data. At
#' a minimum, \code{metadata} should correctly specify the date column.
#' @param retrans.function function that accepts a vector of predictions
#' straight from the fitted model and retransforms them, if appropriate, to
#' predictions in linear space and with units of concentration*flow. Because
#' load models are frequently fit to log(y) ~ ..., the default is
#' retrans.function=exp. After retrans.function has been applied to the
#' predictions, the retransformed predictions will automatically undergo
#' further units conversion from conc*flow to load.rate, according to the
#' metadata elements conc.units, flow.units, and load.rate.units.
#' @param store One or more character strings specifying which information to
#' write within the model. Options are 'data': the original fitting data;
#' 'fitting.function': a fitting function that can produce a new loadLm
#' object from new data
#' @param fitting_function a fitting function that can produce a new loadLm
#' object from new data. If 'fitting.function' is included in \code{store},
#' this argument is replaced by an internally generated fitting function.
#' @param y.trans.function function, if any, that was applied to the response
#' values before they were passed to this function. Used together with
#' \code{formula} to infer \code{ylog} when \code{ylog} is missing; this should
#' usually be NULL (when the log transformation appears in the formula) or
#' \code{log}.
#' @param ylog logical. If TRUE, this constitutes affirmation that the values
#' passed to the left-hand side of the model formula will be in log space. If
#' missing, the value of \code{ylog} will be inferred from the values of
#' \code{formula} and \code{y.trans.function}, but be warned that this
#' inference is fallible.
#' @return A fitted loadLm model.
#' @export
#' @family load.model.inits
#' @examples
#' data(eg_fitdat, eg_metadata)
#' no3_lm <- loadLm(formula=log(NO3) ~ log(DISCHARGE) + DATE, pred.format="conc",
#' data=eg_fitdat, metadata=eg_metadata, retrans=exp)
loadLm <- function(formula, pred.format=c("flux","conc"),
data, metadata, fitting_function=NULL,
y.trans.function=NULL, retrans.function=exp,
store=c("data","fitting.function"), ylog) {
# Validate arguments
pred.format <- match.arg.loadflex(pred.format)
store <- match.arg.loadflex(store, choices=c("data","fitting.function"), several.ok=TRUE)
if(missing(ylog)) {
# Assume logged ys (ylog=TRUE) unless the checks below find evidence to the contrary
ylog <- TRUE
# Check that y.trans.function or the formula suggest a log transformation
if(!is.null(y.trans.function)) {
if(!isTRUE(all.equal(y.trans.function, log))) { # all.equal returns a character vector, not FALSE, on mismatch
warning("y.trans.function should usually be NULL (if the formula has a log) or log")
ylog <- FALSE
}
} else if(!any(as.list(as.list(formula)[[2]]) == as.symbol("log"))) {
warning("formula should contain a log on the LHS unless y.trans.function is log")
ylog <- FALSE
}
# Check that retrans.function == exp
if(is.null(retrans.function)) {
warning("Currently, retrans.function should be exp for loadLm models")
ylog <- FALSE
} else if(!isTRUE(all.equal(retrans.function, exp))) {
warning("Currently, retrans.function should be exp for loadLm models")
ylog <- FALSE
}
}
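# For example: with the default retrans.function=exp, a formula like
# log(NO3) ~ log(DISCHARGE) + DATE is taken as confirmation of logged ys
# (ylog stays TRUE), while a formula whose left-hand side lacks log() triggers
# a warning and sets ylog to FALSE.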
# If requested, generate fitting function
if("fitting.function" %in% store) {
fitting_function <- function(training.data, store=c()) {
loadLm(formula=formula, pred.format=pred.format,
data=training.data, metadata=metadata,
fitting_function=fitting_function, retrans.function=retrans.function,
store=store, ylog=ylog)
}
}
# Create the model
load.model <- new(
"loadLm",
fit=lm(formula, data=data),
pred.format=pred.format,
metadata=metadata,
data=data,
fitting.function=if("fitting.function" %in% store) fitting_function else NULL,
retrans.function=retrans.function,
ylog=ylog)
load.model
}
#' Make flux or concentration predictions from a loadLm model.
#'
#' Makes instantaneous predictions (at the temporal resolution of
#' \code{newdata}) from a fitted \code{\link{loadLm}} model. See
#' \code{\link{predictSolute}} for details.
#'
#' @importFrom stats qnorm qt
#' @inheritParams predictSolute
#' @param load.model A loadLm object.
#' @param newdata \code{data.frame}, optional. Predictor data. Column names
#' should match those given in the \code{loadLm} metadata. If
#' \code{newdata} is not supplied, the original fitting data will be used.
#' @return A vector or data.frame of predictions, as for the generic
#' \code{\link{predictSolute}}.
#' @export
#' @family predictSolute
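#' @examples
#' # A minimal usage sketch; it assumes the package's example datasets
#' # eg_fitdat and eg_metadata, as in the loadLm() example
#' \dontrun{
#' data(eg_fitdat, eg_metadata)
#' no3_lm <- loadLm(formula=log(NO3) ~ log(DISCHARGE) + DATE, pred.format="conc",
#'   data=eg_fitdat, metadata=eg_metadata, retrans=exp)
#' predictSolute(no3_lm, flux.or.conc="conc", se.pred=TRUE, date=TRUE)
#' }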
predictSolute.loadLm <- function(load.model, flux.or.conc=c("flux","conc"), newdata,
interval=c("none","confidence","prediction"), level=0.95,
lin.or.log=c("linear","log"), se.fit=FALSE, se.pred=FALSE,
date=FALSE, attach.units=FALSE,
agg.by=c("unit", "day", "month", "water year",
"calendar year", "total", "mean water year",
"mean calendar year", "[custom]"), ...) {
# Validate arguments
flux.or.conc <- match.arg.loadflex(flux.or.conc)
interval <- match.arg.loadflex(interval)
attach.units <- match.arg.loadflex(attach.units)
lin.or.log <- match.arg.loadflex(lin.or.log)
agg.by <- match.arg(agg.by)
# Check the model - can we confirm that y is logged?
if(!load.model@ylog) {
stop("For accurate prediction, y values must have been logged during model fitting")
}
# If there's no newdata, use the data
if(missing(newdata)) {
newdata <- load.model@data
}
# Get the direct model predictions from the fit object, adding intervals if
# requested and getting se.fit if either se.fit or se.pred is requested. (But
# at least while loadLm requires logged left-hand sides, we will always need
# se.fit to retransform the mean.)
preds_log <- predict(load.model@fit, newdata=newdata, interval=interval, se.fit=TRUE)
# Take whatever format we got and put it into a data.frame
if(is.matrix(preds_log)) {
preds_log <- as.data.frame(preds_log) # e.g., interval="confidence" & se.fit=FALSE
} else if(is.list(preds_log)) {
if(is.matrix(preds_log[[1]])) {
preds_log[[1]] <- as.data.frame(preds_log[[1]])
preds_log <- data.frame(preds_log$fit, se.fit=preds_log$se.fit)
} else {
preds_log <- data.frame(fit=preds_log$fit, se.fit=preds_log$se.fit)
}
} else {
preds_log <- data.frame(fit=preds_log)
}
# Calculate se.pred if needed. At least while loadLm only permits logged
# left-hand sides, we will always need se.pred
if(TRUE | se.pred) {
# This calculation of se.pred is consistent with the predict.lm code.
# Lines pulled & slightly modified from predict.lm:
# object <- load.model@fit
# p <- object$rank
# p1 <- seq_len(p)
# piv <- if (p) stats:::qr.lm(object)$pivot[p1]
# res.var <- sum(object$residuals^2)/object$df.residual
# # calculate ip
# X <- model.matrix(object)
# XRinv <- X[, piv] %*% qr.solve(qr.R(stats:::qr.lm(object))[p1, p1])
# ip <- drop(XRinv^2 %*% rep(res.var, p))
# # calculate pred.var
# weights <- 1
# pred.var <- res.var/weights
# # construct intervals
# level = 0.95
# tfrac <- qt((1 - level)/2, object$df.residual)
# hwid <- tfrac * switch(
# interval,
# confidence = sqrt(ip),
# prediction = sqrt(ip+pred.var))
# predictor <- cbind(preds$fit, preds$fit + hwid %o% c(1, -1))
# colnames(predictor) <- c("fit", "lwr", "upr")
# therefore, ip = SE_mean and ip+pred.var=SE_pred.
# Calculate se.pred^2 == se.fit^2 + pred.var, where pred.var == residual
# mean squared error (res.var/df) divided by weights, where we'll
# assume/require for loadLm that weights are all 1.
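# Illustrative numbers only: with se.fit=0.1 and pred.var=0.04,
# se.pred = sqrt(0.1^2 + 0.04) = sqrt(0.05) ~= 0.224.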
res.var <- sum(load.model@fit$residuals^2)/load.model@fit$df.residual
if(!is.null(load.model@fit$weights)) stop("Weights must be NULL for now. Sorry.")
weights <- 1
pred.var <- res.var/weights
# preds_log is always a data.frame by now, so just add a column
preds_log$se.pred <- sqrt(preds_log$se.fit^2+pred.var)
}
# Apply the retransformation (which we currently require to be exp) to create
# preds_lin from preds_log
# Only perform retransformation if linear is desired
if(lin.or.log == "log") {
preds <- preds_log
} else {
if(!isTRUE(all.equal(load.model@retrans.function, exp))) {
# For now, restrict this function to loadLms where retrans.function == exp,
# i.e., the model is fit to log-transformed y values.
stop("predictSolute.loadLm currently only implemented for models where retrans.function == exp")
}
# This is the standard conversion from the moments of a normal distribution in
# log space to the moments of the corresponding lognormal distribution in
# linear space. If mu and sigma are the log-space moments (preds_log$fit and
# preds_log$se.fit or preds_log$se.pred, depending on the statistic), then back
# in linear space the moments of X are preds_lin$fit=m=exp(mu+sigma^2/2) and
# preds_lin$se.fit^2=v=(exp(sigma^2) - 1)*exp(2*mu+sigma^2).
# The mean: always use se.pred (rather than se.fit) to calculate the mean in
# linear space. logToLin is one way of doing a "bias correction", sensu Cohn
# 2005: it returns a meanlin of exp(meanlog+sdlog^2/2) instead of
# exp(meanlog). Other options would be to use the jackknife, smearing, or
# Finney's MVUE estimators; of those, MVUE has been repeatedly advocated by
# Cohn and others, and something close to MVUE is encoded in rloadest.
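# Numerical illustration: with meanlog=0 and sdlog=0.5, the retransformed mean
# is exp(0 + 0.5^2/2) ~= 1.133, not the naive exp(0) = 1.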
meansd_lin <- logToLin(meanlog=preds_log$fit, sdlog=preds_log$se.pred)
preds_lin <- data.frame(fit=meansd_lin$meanlin)
# The intervals: use a t distribution in log space, then reconvert to linear
# space. We could also use qlnorm, but that would use a normal rather than t
# in log space, and we do know the number of degrees of freedom. Regardless,
# this will be quite close to the output of qlnorm(p=0.5+c(-1,1)*level/2,
# meanlog=preds_log$fit, sdlog=preds_log$se.pred).
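# For example, with df.residual=30 and level=0.95 the multiplier is
# qt(0.975, 30) ~= 2.04 rather than qnorm(0.975) ~= 1.96, so the t-based
# intervals are slightly wider, as is appropriate for an estimated variance.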
if(interval != "none") {
se_ci <- switch(interval, confidence=preds_log$se.fit, prediction=preds_log$se.pred)
DF <- load.model@fit$df.residual
ci_quantiles <- qt(p=0.5+c(-1,1)*level/2, DF)
preds_lin$lwr <- exp(preds_log$fit + ci_quantiles[1]*se_ci)
preds_lin$upr <- exp(preds_log$fit + ci_quantiles[2]*se_ci)
}
# The SEs:
if(se.fit) {
preds_lin$se.fit <- logToLin(meanlog=preds_log$fit, sdlog=preds_log$se.fit)$sdlin
}
if(se.pred) {
preds_lin$se.pred <- meansd_lin$sdlin
}
preds <- preds_lin
} # end log to lin transformation
# Change flux/conc formats if appropriate
preds <- formatPreds(preds, from.format=load.model@pred.format, to.format=flux.or.conc,
newdata=newdata, metadata=load.model@metadata, lin.or.log=lin.or.log,
attach.units=attach.units)
# Add dates if requested
if(date) {
if(!is.data.frame(preds)) {
preds <- data.frame(fit=preds)
}
# prepend the date column
preds <- data.frame(date=getCol(load.model@metadata, newdata, "date"), preds)
}
# If it's just the central predictions, return them as a vector rather than a data.frame
if(ncol(preds) == 1) {
preds <- preds$fit
}
# use aggregateSolute() to aggregate to agg.by, which warns and returns NA for uncertainty
if(agg.by != "unit") {
preds <- aggregateSolute(preds, metadata = getMetadata(load.model), agg.by = agg.by,
format = flux.or.conc, dates = getCol(load.model@metadata, newdata, "date"))
}
return(preds)
}
#' Resample the coefficients of a linear model (lm)
#'
#' Returns a new linear model whose coefficients have been resampled from the
#' joint distribution implied by the original fit's coefficient covariance and
#' residual uncertainty
#'
#' (Although the name suggests otherwise, resampleCoefficients is not currently
#' an S3 generic. You should refer to this function by its complete name.)
#'
#' @importFrom stats coef rchisq
#' @importFrom MASS mvrnorm
#' @param fit an lm object whose coefficients should be resampled
#' @return A new lm object with resampled coefficients such that predict.lm()
#' will make predictions reflecting those new coefficients. No other
#' properties of the returned model are guaranteed.
#' @export
#' @references
#' http://www.clayford.net/statistics/simulation-to-represent-uncertainty-in-regression-coefficients/
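#' @examples
#' # A small sketch using the built-in cars dataset: repeated calls perturb the
#' # coefficients around the original fit
#' fit <- lm(log(dist) ~ log(speed), data=cars)
#' coef(fit)
#' coef(resampleCoefficients.lm(fit))
#' coef(resampleCoefficients.lm(fit))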
resampleCoefficients.lm <- function(fit) {
# Extract useful info from the model. summary(fit)$cov.unscaled is equal to
# solve(crossprod(model.matrix(fit))), which looks like it could be more
# direct and therefore faster, but I checked, using
# autoplot(microbenchmark(summary(fit)$cov.unscaled,
# solve(crossprod(model.matrix(fit))))): the summary() method is faster.
coefs <- coef(fit) # coefficients
cov.unscaled <- summary(fit)$cov.unscaled # unscaled covariance matrix.
s.hat <- summary(fit)$sigma # residual standard error
n.minus.k <- summary(fit)$df[2] # n - k
# Simulate residual standard deviation as s.hat*sqrt(df/X) where s.hat is the
# estimate of sigma, df=n.minus.k is the degrees of freedom, and X "is a
# random draw from the chi-squared distribution with n-k degrees of freedom"
# (www.clayford.net)
s.hat.sim <- s.hat*sqrt(n.minus.k/rchisq(1, n.minus.k))
# Simulate regression coefficients
coef.resim <- mvrnorm(n=1, mu=coefs, Sigma=s.hat.sim^2*cov.unscaled)
# Put the new coefficients into the model. It's $coefficients, not $coef, that
# seems to matter for predict.lm(), so just do that one for efficiency's sake.
fit$coefficients <- coef.resim
# Return the lm object
fit
}
#' Produce a set of predictions that reflect the coefficient uncertainty and
#' natural variation.
#'
#' This function resamples the coefficients from their joint distribution, then
#' makes predictions whose individual errors are sampled from a time series with
#' the same first-order autocorrelation as the original series of errors.
#'
#' @importFrom stats arima.sim sd
#' @inheritParams simulateSolute
#' @param load.model A loadLm object.
#' @param newdata \code{data.frame}, optional. Predictor data. Column names
#' should match those given in the \code{loadLm} metadata. If
#' \code{newdata} is not supplied, the original fitting data will be used.
#' @return A vector of predictions that are distributed according to the
#' uncertainty of the coefficients and the estimated natural variability +
#' measurement error.
#' @export
#' @family simulateSolute
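#' @examples
#' # A hedged sketch; it assumes the package's example datasets eg_fitdat and
#' # eg_metadata, as in the loadLm() example. Results differ between calls by
#' # construction.
#' \dontrun{
#' data(eg_fitdat, eg_metadata)
#' no3_lm <- loadLm(formula=log(NO3) ~ log(DISCHARGE) + DATE, pred.format="conc",
#'   data=eg_fitdat, metadata=eg_metadata, retrans=exp)
#' simulateSolute(no3_lm, flux.or.conc="conc", method="parametric",
#'   from.interval="confidence")
#' }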
simulateSolute.loadLm <- function(load.model, flux.or.conc=c("flux","conc"), newdata,
method=c("parametric", "non-parametric"), from.interval=c("confidence", "prediction"), rho, ...) {
# Validate arguments
flux.or.conc <- match.arg.loadflex(flux.or.conc)
from.interval <- match.arg.loadflex(from.interval, c("confidence","prediction"))
method <- match.arg.loadflex(method, c("parametric", "non-parametric"))
if(missing(newdata)) newdata <- getFittingData(load.model) # need to do this for predictSolute call in case non-parametric method overwrites load.model@data
# Generate bootstrap model
if(method=="parametric") {
# Adjust the load.model@fit to contain the new (resampled) coefficients.
load.model@fit <- resampleCoefficients.lm(load.model@fit)
} else if(method=="non-parametric") {
# Refit the model to new (resampled) data - this probably takes longer but has nice statistical properties
df <- getFittingData(load.model)
load.model <- getFittingFunction(load.model)(
training.data=df[sample.int(nrow(df), replace=TRUE), ],
store=c())
}
# Make predictions for the new data
fitting.preds <- predictSolute(load.model, flux.or.conc=flux.or.conc, newdata=newdata, interval="none")
## Resample Errors ##
# EXPERIMENTAL: Add noise to the predictions with an autocorrelation structure
# based on that of the residuals (or supplied by the caller). To keep the
# autocorrelation that's necessary for loadComp to make sense (even though
# it's assumed not to exist in the calibration data), either (1) estimate
# autocorrelation from the calibration data, or (2) accept an estimate of
# autocorrelation from the user. Then use that autocorrelation and the
# residual standard deviation to generate residuals.
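# For example, arima.sim(model=list(order=c(1,0,0), ar=0.8), n=1000) draws an
# AR(1) series whose lag-1 autocorrelation approaches 0.8 as n grows.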
if(from.interval == "prediction") {
if(missing(rho)) {
rho.out <- estimateRho(load.model, flux.or.conc=flux.or.conc, abs.or.rel.resids="absolute", use.log=TRUE,
newdata=newdata, irregular.timesteps.ok=TRUE, plot.acf=FALSE)
rho <- rho.out$rho # implies that the time.step for newdata is the one we want
}
# Simulate the residuals so that they have the autocorrelation structure
# that was estimated from the original residuals or supplied by the caller.
# Not sure whether rho takes care of matching the distribution of resampled
# residuals to the original; it seems not to.
noise <- as.numeric(arima.sim(model=list(order=c(1,0,0), ar=rho), n=nrow(newdata)))
# Normalize the noise to have the same sd as the original residuals. Is that reasonable?
s.hat <- summary(load.model@fit)$sigma
noise <- noise * s.hat/sd(noise)
# Add the noise back into the predictions, first converting back to linear space
fitting.preds <- exp(log(fitting.preds) + noise)
}
return(fitting.preds)
}
#' Extract model summary statistics from a loadLm model
#'
#' Produce a 1-row data.frame of model metrics.
#'
#' @md
#' @inheritParams summarizeModel
#' @return Returns a 1-row data frame with the following columns:
#'
#' * `site.id` - the unique identifier of the site, as in [metadata()]
#'
#' * `constituent` - the unique identifier of the constituent, as in
#' [metadata()]
#'
#' * `eqn` - the regression formula used to fit the model
#'
#' * `RMSE` - the square root of the mean squared error. Errors will be
#' computed from either fluxes or concentrations, as determined by the value
#' of `pred.format` that was passed to [loadLm()] when this model was
#' created
#'
#' * `r.squared` - the r-squared value, describing the amount of observed
#' variation explained by the model
#'
#' * `p.value` - the p-value for the overall model fit
#'
#' * `cor.resid` - the serial correlation of the model residuals
#'
#' * `PPCC` - the probability plot correlation coefficient measuring the
#' normality of the residuals
#'
#' * `Intercept`, `lnQ`, `lnQ2`, `DECTIME`, `DECTIME2`, `sin.DECTIME`,
#' `cos.DECTIME`, etc. - the fitted value of the intercept and other terms
#' included in this model (list differs by model equation)
#'
#' * `Intercept.SE`, `lnQ.SE`, `lnQ2.SE`, `DECTIME.SE`, `DECTIME2.SE`,
#' `sin.DECTIME.SE`, `cos.DECTIME.SE`, etc. - the standard error of the fitted
#' estimates of these terms
#'
#' * `Intercept.p.value`, `lnQ.p.value`, `lnQ2.p.value`, `DECTIME.p.value`,
#' `DECTIME2.p.value`, `sin.DECTIME.p.value`, `cos.DECTIME.p.value` - the
#' p-values for each of these model terms
#'
#' @importFrom dplyr select everything
#' @export
#' @family summarizeModel
#' @examples
#' data(eg_loadLm)
#' summarizeModel(eg_loadLm)
summarizeModel.loadLm <- function(load.model, ...) {
# collect the default summary
out <- NextMethod()
# add coefs and other overall statistics, maintaining a single 1-row data.frame
fit <- getFittedModel(load.model)
sumfit <- summary(fit)
out$eqn <- capture.output({
ft <- fit$terms
attributes(ft) <- NULL
print(ft)
})
out$RMSE <- smwrStats::rmse(fit)
out$r.squared <- sumfit$r.squared
out$p.value <- stats::pf(sumfit$fstatistic[[1]], sumfit$fstatistic[[2]], sumfit$fstatistic[[3]], lower.tail=FALSE)
out$cor.resid <- stats::cor(sumfit$residuals[-1], sumfit$residuals[-length(sumfit$residuals)])
out$PPCC <- smwrStats::ppcc.test(fit$residuals)$statistic[['r']]
# summarize the coefficient estimates, standard errors, and p-values
coefs <- coef(summary(fit))
coefsDF <- setNames(
data.frame(
t(coefs[,'Estimate']),
SE=t(coefs[,'Std. Error']),
pval=t(coefs[,'Pr(>|t|)'])
),
nm = paste0(
rep(gsub('(Intercept)', 'Intercept', row.names(coefs), fixed=TRUE), 3), # coefficient names
rep(c('', '.SE', '.p.value'), each=nrow(coefs))) # aspect of coefficient being described
)
out <- data.frame(out, coefsDF, check.names=FALSE, stringsAsFactors=FALSE)
# return
return(out)
}