#' Predict Loads
#'
#' Estimate loads from a rating-curve model from \code{loadReg}
#'for a new data frame, aggregating the loads by specified time
#'periods.
#'
#' The time frame specified by \code{by} can be "unit,"
#'"day," "month," "water year," "calendar year," "total," or
#'the name of a column in \code{newdata} that can be used to
#'group the data.
#'
#' If \code{allow.incomplete} is \code{TRUE}, then loads will be
#'computed based on all nonmissing values, otherwise missing values
#'\code{NAs} will be returned. For this application, missing values
#'includes \code{NAs} and gaps in the record, except for \code{by}
#'set to "total" or user defined groups where missing values only
#'includes \code{NAs}. For prediction by "day" when there are variable
#'number of unit values per day, \code{allow.incomplete} must be
#'set to \code{TRUE}.
#'
#' The term confidence interval is used here as in the original
#'documentation for LOADEST, but the values that are reported are
#'the prediction intervals, computed from the SEP.
#'
#' @param fit the output from \code{loadReg}.
#' @param newdata a data frame of the prediction variables. Missing values
#'are not permitted in any column in \code{newdata}. Observations with
#'missing values \code{NAs} must be removed before prediction. Columns that
#'are not needed for prediction that contain missing values can be removed
#'before removing all rows with missing values. The maximum number of rows
#'permitted in \code{newdata} is 176000.
#' @param load.units a character string indicating the units of the
#'predicted loads/fluxes. By default, uses the value specified in
#'\code{loadReg}. See \code{\link{loadReg}} for a complete list of options.
#' @param by the time frame for estimates. See \code{Details}.
#' @param seopt a character string indicating how to comute the
#'standard error of the aggregated load estimates, must be either
#'"exact" or "approximate." Only the first letter is necessary.
#' @param allow.incomplete compute loads for periods withing
#'missing values or incomplete record? See \code{Details}.
#' @param conf.int the confidence interval to compute for loads
#'computed by "day" or "unit." The confidence interval is fixed at
#'0.95 for any other value for \code{by}. See \code{Details}.
#' @param print print a report summary of the load estimate?
#' @return A data frame containing the load estimates.
#' @seealso \code{\link{loadReg}},
#' @examples
#'# From application 1 in the vignettes
#'data(app1.calib)
#'app1.lr <- loadReg(Phosphorus ~ model(1), data = app1.calib,
#' flow = "FLOW", dates = "DATES", conc.units="mg/L",
#' station="Illinois River at Marseilles, Ill.")
#'predLoad(app1.lr, app1.calib)
#' @useDynLib rloadest estlday
#' @useDynLib rloadest estltot
#' @export
predLoad <- function(fit, newdata, load.units=fit$load.units, by="total",
seopt="exact", allow.incomplete=FALSE,
conf.int=0.95, print=FALSE) {
## Coding history:
## 2013Jun06 DLLorenz Original Coding
## 2013Jul03 DLLorenz Added code to deal with unit values
## 2013Dec05 DLLorenz Bug fix
## 2014Sep23 DLLorenz Missing check on newdata
##
## By options and other preliminary code
if(nrow(newdata) > 176000L) {
stop("newdata has too many rows, the size limit is 176000")
}
ByOpt <- c("unit", "day", "month", "water year", "calendar year", "total")
match.arg(by, ByOpt)
load.units
seopt <- match.arg(seopt, c("exact", "approximate"))
Qadj <- fit$Qadj
Tadj <- fit$Tadj
flow <- fit$flow
dates <- fit$dates
model.no <- fit$model.no
time.step <- fit$time.step
gps.nday <- 1L
## Aggregate newdata by the time.step
if(class(newdata[[dates]])[1L] == "Date") {
if(time.step != "day")
stop("Time steps of ", time.step, "require POSIXt dates for the estimation data")
gps <- format(newdata[[dates]])
} else if(time.step != "instantaneous") {
## Construct grouping time steps of the unit value data
if(time.step == "day") {
## Extract the day part
gps <- format(newdata[[dates]], "%Y-%m-%d")
} else {
## Required to force hours to match local time rather than
## conversion to GMT, which offsets the local time
gps <- as.POSIXlt(as.character(newdata[[dates]]), tz="GMT")
N.gps <- length(gps)
gps$sec <- gps$min <- rep(0, N.gps)
## OK, now aggregate the hours
hts <- as.numeric(strsplit(time.step, split=" ")[[1L]][1L])
gps$hour <- (gps$hour %/% hts) * hts
gps <- format(gps, "%Y-%m-%d %H")
## Set the number of obs per day
gps.nday <- as.integer(24.0001/hts) # Just in case
}
## Aggregate newdata and reset timezone info
ntz <- attr(newdata[[dates]], "tzone")
newdata <- aggregate(newdata, list(time.step=gps), mean)
attr(newdata[[dates]], "tzone") <- ntz
} else { # Must be instantaneous
gps <- format(newdata[[dates]], format="%Y-%m-%d")
numdays <- length(unique(gps)) # the number of days in the datset
gps.nday <- round(nrow(newdata)/as.numeric(numdays), 6L) # should get to near integer
if((gps.nday %% 1) != 0 && !(by %in% c("unit", "day")))
stop("newdata must have the same number of observations each day")
gps.nday <- as.integer(gps.nday)
}
lcf <- loadUnitConv(fit$load.units, load.units)
if(model.no == 99L) {
Terms <- delete.response(fit$lfit$terms)
m <- model.frame(Terms, newdata, na.action = na.pass)
model.inp <- model.matrix(Terms, m)
} else {
model.inp <- setXLDat(newdata, flow, dates, Qadj, Tadj, model.no)
}
if(any(is.na(model.inp)))
stop("Prediction data contains missing values, remove before prediction")
## Construct the structure for aggregating the data
## Deal with byn == 0 directly in the last estimation section
checkDays <- list() # Create empty list for day checking
byn <- pmatch(by, ByOpt, nomatch=0L)
if(byn == 3L) { # by month
## Get the expected number of days for each month
mn <- month(newdata[[dates]])
yr <- year(newdata[[dates]])
dyinper <- daysInMonth(mn, yr)
## Create the categories
by <- "_by_"
Period <- paste(month.name[mn], yr, sep=" ")
newdata[[by]] <- factor(Period, levels=unique(Period)) # Force date order
checkDays <- tapply(dyinper, Period, function(x) x[1L])
} else if(byn == 4L) { # by water year
yr <- waterYear(newdata[[dates]], numeric=TRUE)
dyinper <- 365L + leap_year(yr)
by <- "_by_"
Period <- paste("WY", yr, sep=" ")
newdata[[by]] <- Period
checkDays <- tapply(dyinper, Period, function(x) x[1L])
} else if(byn == 5L) { # by calendar year
yr <- year(newdata[[dates]])
dyinper <- 365L + leap_year(yr)
by <- "_by_"
Period <- paste("CY", yr, sep=" ")
newdata[[by]] <- Period
checkDays <- tapply(dyinper, Period, function(x) x[1L])
} else if(byn == 0L) {
## No way to check the correct number of days a priori
## Check to see that it is in newdata
if(!(by %in% names(newdata)))
stop(by, " not found in the estimation dataset")
} else
by <- ByOpt[byn]
if(by == "unit") {
## Only 1 obs per unit
KDays <- seq(nrow(model.inp))
## Exclude NAs and presumably 0 flows
KDays <- KDays[is.finite(rowSums(model.inp))]
NDIM <- as.integer(length(KDays))
out.data <- .Fortran("estlday",
NPAR=fit$lfit$NPAR,
PARMLE=fit$lfit$PARMLE,
BIAS=fit$lfit$BIAS,
CV_IN=fit$lfit$CV,
SBIAS=fit$lfit$SBIAS,
SCV_IN=fit$lfit$SCV,
npred=as.integer(length(KDays)),
xlpred=model.inp[KDays,],
NDAYS=NDIM,
KDAY2=as.integer(KDays),
load=double(NDIM),
loadvar=double(NDIM),
loadlow=double(NDIM),
loadup=double(NDIM),
loadsep=double(NDIM),
IERR=integer(1L))
Flux <- Std.Err <- SEP <- L95 <- U95 <- rep(NA_real_, nrow(model.inp))
if(out.data$IERR > 0) {
warning(" *** Error (",out.data$IERR,") occurred in processing daily data. ***\n",sep="")
retval <- data.frame(Date=newdata[[dates]], Flow=Flow,
Flux=Flux, Std.Err=Std.Err, SEP=SEP,
L95=L95, U95=U95)
} else {
## OK, we've had a successful run, pack the data into a data frame
Flux[KDays] <- out.data$load * lcf
Std.Err[KDays] <- sqrt(out.data$loadvar) * lcf
SEP[KDays] <- out.data$loadsep * lcf
## Convert to correct conf.int
Names <- paste(c("L", "U"), round(conf.int*100, 0), sep="")
DF <- fit$lfit$NOBSC - fit$lfit$NPAR
B <- sqrt(log(1 + (SEP[KDays]/Flux[KDays])^2))
A <- log(Flux[KDays]) - 0.5*B^2
qci <- qt(1 - (1 - conf.int)/2, DF)
L95[KDays] <- exp(A - qci*B)
U95[KDays] <- exp(A + qci*B)
Flow <- newdata[[flow]]
## convert NAs to 0 for 0 flow
ZDays <- which(Flow == 0)
if(length(ZDays)) {
Flux[ZDays] <- 0
Std.Err[ZDays] <- 0
SEP[ZDays] <- 0
L95[ZDays] <- 0
U95[ZDays] <- 0
}
retval <- data.frame(Date=newdata[[dates]], Flow=Flow,
Flux=Flux, Std.Err=Std.Err, SEP=SEP,
L95=L95, U95=U95)
}
names(retval)[6L:7L] <- Names
} else if(by == "day") {
## One or more obs per day
## KDate are the dates of the days
## KDays are the indexes to days
## Kin are the indexes to the good model inputs
## Kdy are the indexes to the days of model inputs
## KinAll are the indexes to all days
##
## Preserve flow for later
Flow <- newdata[[flow]]
if(time.step == "day") {
KDate <- as.Date(as.POSIXlt(newdata[[dates]]))
KDays <- seq(nrow(model.inp))
KinAll <- KDays
## Exclude NAs and presumably 0 flows
KDays <- KDays[is.finite(rowSums(model.inp))]
Kin <- Kdy <- KDays
NDIM <- as.integer(length(KDays))
} else {
Kin <- seq(nrow(model.inp))
Kin <- Kin[is.finite(rowSums(model.inp))]
KDate <- as.Date(as.POSIXlt(newdata[[dates]]))
Kdy <- as.integer(KDate)
Ktbl <- table(Kdy)
if(length(unique(Ktbl)) > 1L && !allow.incomplete) {
warning("Variable observations per day, either set the allow.incomplete argument to TRUE or use the resampleUVdata function to construct a uniform series")
}
Kdy <- rep(seq(1, length(Ktbl)), Ktbl) # Create the index to days in input
KDate <- unique(KDate)
KinAll <- unique(Kdy)
## Make it daily flow, Flow0 indicates a partial 0 flow
Flow0 <- tapply(Flow, Kdy, min)
Flow <- tapply(Flow, Kdy, mean)
Flow0 <- ifelse(Flow == 0, 1, Flow0)
Kdy <- Kdy[Kin]
## Make a count of Kins for each day
Kintmp <- table(Kdy)
KinK <- rep(0L, along=KinAll)
KinK[as.integer(names(Kintmp))] <- Kintmp
## OK, process rest of indexes
KDays <- unique(Kdy)
NDIM <- as.integer(length(KDays))
}
out.data <- .Fortran("estlday",
NPAR=fit$lfit$NPAR,
PARMLE=fit$lfit$PARMLE,
BIAS=fit$lfit$BIAS,
CV_IN=fit$lfit$CV,
SBIAS=fit$lfit$SBIAS,
SCV_IN=fit$lfit$SCV,
npred=as.integer(length(Kdy)),
xlpred=model.inp[Kin,],
NDAYS=NDIM,
KDAY2=as.integer(Kdy),
load=double(NDIM),
loadvar=double(NDIM),
loadlow=double(NDIM),
loadup=double(NDIM),
loadsep=double(NDIM),
IERR=integer(1L))
Flux <- Std.Err <- SEP <- L95 <- U95 <- rep(NA_real_, length(KinAll))
if(out.data$IERR > 0) {
warning(" *** Error (",out.data$IERR,") occurred in processing daily data. ***\n",sep="")
retval <- data.frame(Date=KDate, Flow=Flow,
Flux=Flux, Std.Err=Std.Err, SEP=SEP,
L95=L95, U95=U95)
} else {
## OK, we've had a successful run, pack the data into a data frame
Flux[KDays] <- out.data$load * lcf
Std.Err[KDays] <- sqrt(out.data$loadvar) * lcf
SEP[KDays] <- out.data$loadsep * lcf
## Convert to correct conf.int
Names <- paste(c("L", "U"), round(conf.int*100, 0), sep="")
DF <- fit$lfit$NOBSC - fit$lfit$NPAR
B <- sqrt(log(1 + (SEP[KDays]/Flux[KDays])^2))
A <- log(Flux[KDays]) - 0.5*B^2
qci <- qt(1 - (1 - conf.int)/2, DF)
L95[KDays] <- exp(A - qci*B)
U95[KDays] <- exp(A + qci*B)
## Convert NAs to 0 for 0 flow
ZDays <- which(Flow == 0)
if(length(ZDays)) {
Flux[ZDays] <- 0
Std.Err[ZDays] <- 0
SEP[ZDays] <- 0
L95[ZDays] <- 0
U95[ZDays] <- 0
}
## Repair partial 0 flow days and process incomplete days
if(gps.nday > 1L)
for(i in KinAll) {
if(Flow0[i] == 0) {
## The correction factor to make it a true daily mean
CorrFact <- KinK[i]/gps.nday
Flux[i] <- Flux[i] * CorrFact
Std.Err[i] <- Std.Err[i] * CorrFact
SEP[i] <- SEP[i] * CorrFact
L95[i] <- L95[i] * CorrFact
U95[i] <- U95[i] * CorrFact
} else if(KinK[i] != gps.nday && !allow.incomplete) {
Flux[i] <- NA_real_
Std.Err[i] <- NA_real_
SEP[i] <- NA_real_
L95[i] <- NA_real_
U95[i] <- NA_real_
}
} # End of if too
retval <- data.frame(Date=KDate, Flow=Flow,
Flux=Flux, Std.Err=Std.Err, SEP=SEP,
L95=L95, U95=U95)
}
names(retval)[6L:7L] <- Names
} else if(by == "total") {
KDays <- seq(nrow(model.inp))
Flow <- newdata[[flow]]
Sum <- rowSums(model.inp) # Use to detect any missing values
## No reason to checkDays because total is total
if(any(drop <- is.na(Sum) & Flow > 0) && !allow.incomplete) {
warning("Missing values in newdata")
## Return all NAs
retval <- data.frame(Period="total", Ndays=NA_integer_,
Flux=NA_real_, Std.Err=NA_real_,
SEP=NA_real_, L95=NA_real_, U95=NA_real_)
return(retval) # Skip print
}
if(any(drop)) {
model.inp <- model.inp[!drop,]
Flow <- Flow[!drop]
KDays <- KDays[!drop]
}
Zdays <- which(Flow == 0)
NDIM <- as.integer(length(KDays))
NDays <- as.integer(NDIM/gps.nday)
if(length(Zdays))
KDays <- KDays[-Zdays]
out.data <- .Fortran("estltot",
NPAR=fit$lfit$NPAR,
PARMLE=fit$lfit$PARMLE,
BIAS=fit$lfit$BIAS,
CV_IN=fit$lfit$CV,
SBIAS=fit$lfit$SBIAS,
SCV_IN=fit$lfit$SCV,
npred=as.integer(length(KDays)),
xlpred=model.inp[KDays,],
NDAYS=NDays,
N_DAY=gps.nday,
SDEXACT=seopt=="exact",
load=double(1L),
loadvar=double(1L),
loadlow=double(1L),
loadup=double(1L),
loadsep=double(1L),
IERR=integer(1L))
if(out.data$IERR > 0) {
warning(" *** Error (",out.data$IERR,") occurred in processing total load. ***\n",sep="")
retval <- data.frame(Period="total", Ndays=NA_integer_,
Flux=NA_real_, Std.Err=NA_real_,
SEP=NA_real_, L95=NA_real_, U95=NA_real_)
} else {
## OK, we've had a successful run, pack the data into a data frame
retval <- data.frame(Period="total", Ndays=NDays,
Flux=out.data$load * lcf,
Std.Err=sqrt(out.data$loadvar) * lcf,
SEP=out.data$loadsep * lcf,
L95=out.data$loadlow * lcf,
U95=out.data$loadup * lcf)
}
} else { # Any other kind of aggregation
Flow <- newdata[[flow]] # Needed for summary
retval <- tapply(seq(nrow(model.inp)), newdata[[by]], FUN=function(period) {
KDays <- period
Flow <- newdata[period, flow]
Sum <- rowSums(model.inp[period,]) # Use to detect any missing values
drop <- is.na(Sum) & Flow > 0
## Check for incomplete data
OK <- TRUE
if(!allow.incomplete) {
NDays <- length(KDays) / gps.nday
if(any(drop)) { # Any missing values?
OK <- FALSE
retby <- data.frame(Period="period", Ndays=NDays,
Flux=NA_real_, Std.Err=NA_real_,
SEP=NA_real_, L95=NA_real_, U95=NA_real_)
} else {
## Check for complete periods
Period <- as.character(newdata[[by]][period[1L]])
if(!is.null(targN <- checkDays[[Period]])) {
if(targN != NDays) {
OK <- FALSE
retby <- data.frame(Period="period", Ndays=NDays,
Flux=NA_real_, Std.Err=NA_real_,
SEP=NA_real_, L95=NA_real_, U95=NA_real_)
}
}
}
}
if(OK) {
if(any(drop)) {
KDays <- KDays[!drop]
model.inp <- model.inp[KDays,]
Flow <- Flow[!drop]
}
Zdays <- which(Flow == 0)
NDIM <- as.integer(length(KDays))
NDays <- as.integer(NDIM/gps.nday)
if(length(Zdays))
KDays <- KDays[-Zdays]
out.data <- .Fortran("estltot",
NPAR=fit$lfit$NPAR,
PARMLE=fit$lfit$PARMLE,
BIAS=fit$lfit$BIAS,
CV_IN=fit$lfit$CV,
SBIAS=fit$lfit$SBIAS,
SCV_IN=fit$lfit$SCV,
npred=as.integer(length(KDays)),
xlpred=model.inp[KDays,],
NDAYS=NDays,
N_DAY=gps.nday,
SDEXACT=seopt=="exact",
load=double(1L),
loadvar=double(1L),
loadlow=double(1L),
loadup=double(1L),
loadsep=double(1L),
IERR=integer(1L))
if(out.data$IERR > 0) {
warning(" *** Error (",out.data$IERR,") occurred in processing data. ***\n",sep="")
retby <- data.frame(Period="period", Ndays=NA_integer_,
Flux=NA_real_, Std.Err=NA_real_,
SEP=NA_real_, L95=NA_real_, U95=NA_real_)
} else {
## OK, we've had a successful run, pack the data into a data frame
retby <- data.frame(Period="period", Ndays=NDays,
Flux=out.data$load * lcf,
Std.Err=sqrt(out.data$loadvar) * lcf,
SEP=out.data$loadsep * lcf,
L95=out.data$loadlow * lcf,
U95=out.data$loadup * lcf)
}
}
return(retby)
}
) # End of tapply
Period <- names(retval)
retval <- do.call(rbind, retval)
retval$Period <- Period
} # End of else any other aggregation
rownames(retval) <- NULL
if(print) {
Date <- newdata[[dates]]
cat("\n---------------------------------------------------------------------\n",
"Constituent Output File Part IIa: Estimation (test for extrapolation)\n",
" Load Estimates for ", as.character(min(Date)),
" to ", as.character(max(Date)), "\n",
"---------------------------------------------------------------------\n\n",
sep="")
cat("Streamflow Summary Statistics\n",
"-------------------------------------------\n\n", sep="")
Qsum <- rbind(Cal.=fit$Sum.flow, Est.=summary(Flow))
if(Qsum[2L, ncol(Qsum)] > Qsum[1L, ncol(Qsum)]) {
cat("WARNING: The maximum estimation data set steamflow exceeds the maximum\n",
"calibration data set streamflow. Load estimates require extrapolation.\n\n",
sep="")
} else if(Qsum[2L, 1L] < Qsum[1L, 1L]) {
cat("WARNING: The minimum estimation data set steamflow exceeds the minimum\n",
"calibration data set streamflow. Load estimates require extrapolation.\n\n",
sep="")
} else {
cat("The maximum estimation data set streamflow does not exceed the maximum\n",
"calibration data set streamflow. No extrapolation is required.\n\n",
sep="")
}
if(!(by %in% c("day", "unit"))) {
cat("\n-------------------------------------------------------------\n",
"Constituent Output File Part IIb: Estimation (Load Estimates)\n",
" Load Estimates for ", as.character(min(Date)),
" to ", as.character(max(Date)), "\n",
"---------------------------------------------------------------\n\n",
sep="")
cat("Flux Estimates, in ", load.units, "/d, using ",
fit$lfit$method,
"\n----------------------------------------\n\n", sep="")
print(retval)
} else
cat("Estimates for daily or instantaneous values not printed\n\n")
return(invisible(retval))
} else
return(retval)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.