#' Process Observations
#'
#' This function processes measurements from observation sites in a monitoring network.
#'
#' @param observations data.frame.
#' Observed data records, see \sQuote{Details} section.
#' @param parameters data.frame.
#' Parameter descriptions, see \sQuote{Details} section.
#' @param detection.limits data.frame.
#' Detection limits, see \sQuote{Details} section.
#' @param date.fmt character.
#' Date format used to convert character strings to Date class.
#'
#' @details The \code{observations}, \code{parameters}, and \code{detection.limits}
#' data tables are composed of character-class components.
#'
#' Required columns in the \code{observations} data table include:
#' \code{Site_id}, a unique site identifier;
#' \code{Site_name}, a local site name; and
#' \code{Date}, the measurement date.
#' Measured values are checked for an optional character code in the first digit of each character string value.
#' Character codes are identified using the following criteria:
#' \emph{<}, below reporting level;
#' \emph{E}, estimated value;
#' \emph{V}, contaminated; and
#' \emph{U}, undetectable.
#' Values are stripped of their character code and converted to numeric class.
#' A warning is given if the character code is not recognized and its value set to NA.
#' Measured values with character codes of \emph{V} and \emph{U} are also set to NA.
#'
#' Required columns in the \code{parameters} data table include:
#' \code{Parameter_id}, a unique parameter identifier;
#' \code{Parameter_name}, the common parameter name;
#' \code{Units}, the units associated with the measured parameter values; and
#' \code{sd}, a column name in \code{observations} data table where the parameters standard deviation values are located.
#'
#' A required column in the \code{detection.limits} data table is \code{Date},
#' the date when the detection limit was first implemented.
#' Detection limit values are located in subsequent columns;
#' a unique parameter identifier is specified for each of these column names.
#'
#' A measured value is converted to censored data under the following conditions:
#' (1) the measured value is below the reporting level and represented as \emph{left-censored} data; or
#' (2) there is a standard deviation and detection limit associated with the measured value,
#' therefore, it is represented as \emph{interval-censored} data.
#' The upper and lower bounds of interval-censored data are calculated by
#' adding and subtracting three standard deviations from the measured value, respectively.
#' Interval-censored data with a lower or upper bound less than the detection limit is represented as left-censored data.
#' For left-censored data, the upper bound is set to the detection limit when its magnitude is less than the detection limit.
#'
#' @return Returns an object of class list with data.frame components corresponding to unique parameter identifiers.
#' Each data table has the following components:
#' \describe{
#' \item{Site_id}{unique site identifier}
#' \item{Site_name}{local site name}
#' \item{Date}{observation date}
#' \item{code}{single-digit character code}
#' \item{value}{ measured value}
#' \item{sd}{standard deviation of the measured value.}
#' \item{dl}{detection limit of the measured value.}
#' \item{surv}{observation as censored or uncensored data of \emph{interval} type.}
#' }
#' Additional attributes associated with the returned data frame include:
#' \code{Parameter_id}, unique parameter identifier;
#' \code{Parameter_name}, parameter name; and
#' \code{Units}, parameter units.
#'
#' @author J.C. Fisher, U.S. Geological Survey, Idaho Water Science Center
#'
#' @seealso \code{\link{RunAnalysis}}
#'
#' @keywords methods
#'
#' @export
#'
ProcessObs <- function(observations, parameters, detection.limits=NULL,
date.fmt="%Y-%m-%d") {
observations$Date <- as.Date(observations$Date, format=date.fmt)
observations <- observations[!is.na(observations$Date), ]
par.names <- make.names(parameters$Parameter_id)
par.names <- sort(par.names[par.names %in% colnames(observations)])
detection.limits$Date <- as.Date(detection.limits$Date, format=date.fmt)
detection.limits[, -1] <- apply(detection.limits[, -1], 2, as.numeric)
lst <- list()
for (i in seq_along(par.names)) {
nam <- par.names[i]
is.rec <- !is.na(observations[[nam]]) & observations[[nam]] != ""
if (all(!is.rec))
next
d <- observations[is.rec, c("Site_id", "Site_name", "Date")]
d <- data.frame(d, code=NA, value=NA, sd=NA, dl=NA)
d$Site_id <- as.factor(d$Site_id)
d$Site_name <- as.factor(d$Site_name)
d$value <- as.character(observations[is.rec, par.names[i]])
d$code <- substr(d$value, 1, 1)
d$code[!d$code %in% c("<", "E", "V", "U")] <- ""
d$code <- as.factor(d$code)
is.code <- d$code != ""
d$value[is.code] <- substr(d$value[is.code], 2, nchar(d$value[is.code]))
d$value <- as.numeric(d$value)
d$value[d$code %in% c("V", "U")] <- NA
p <- parameters[match(nam, make.names(parameters$Parameter_id)), , drop=TRUE]
sd.col <- p$sd
idx <- ifelse(!is.na(sd.col), match(sd.col, colnames(observations)), NA)
d$sd <- if (is.na(idx)) NA else as.numeric(observations[is.rec, idx])
if (nam %in% colnames(detection.limits)) {
dl <- detection.limits[!is.na(detection.limits[[nam]]), c("Date", nam)]
for (id in levels(d$Site_id)) {
idxs <- which(d$Site_id == id)
breaks <- findInterval(as.numeric(dl$Date), as.numeric(d$Date[idxs]))
breaks <- unique(c(breaks, length(idxs) + 1L))
d$dl[idxs] <- dl[as.integer(cut(seq_along(idxs), breaks)), nam]
}
}
t1 <- t2 <- rep(NA, nrow(d))
is.left <- d$code %in% "<"
is.exact <- !is.left & (is.na(d$sd) | is.na(d$dl))
t1[is.exact] <- d$value[is.exact]
t2[is.exact] <- d$value[is.exact]
t2[is.left] <- d$value[is.left]
lower.value <- d$value - 3 * d$sd
upper.value <- d$value + 3 * d$sd
are.left <- which(!is.exact & upper.value <= d$dl)
t2[are.left] <- d$dl[are.left]
are.left <- which(!is.exact & lower.value <= d$dl & upper.value > d$dl)
t2[are.left] <- upper.value[are.left]
are.interval <- which(!is.exact & lower.value > d$dl)
t1[are.interval] <- lower.value[are.interval]
t2[are.interval] <- upper.value[are.interval]
t2[which(t2 <= 0)] <- NA # TODO: zero is invalid for 'lognormal' distribution
t1[which(t1 <= 0 | is.na(t2))] <- NA
d$surv <- Surv(time=t1, time2=t2, type="interval2")
d <- d[order(d$Site_name, d$Date), ]
vars <- c("Parameter_id", "Parameter_name", "Units")
attributes(d) <- c(attributes(d), p[vars])
lst[[p$Parameter_id]] <- d
}
return(lst)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.