#' Identify the columns to group by based on the specified block
#'
#' @param .vec a character vector representing: 1) "all", 2) "date",
#' 3) "single, 4) "month, or 5) "30-day.
#' @return A vector of column names to be selected for grouping
block_switch <- function(.vec) {
# Includes within period to insure downstream that samples outside of the
# sampling period do not contribute to the assessment of the samples within
# the sampling period.
switch(
EXPR = .vec,
"single" = "row_number",
"all" = c("assessment_id", "statistic", "within_period"),
"date" = c("assessment_id", "statistic", "date", "within_period"),
"month" = c("assessment_id", "statistic", "year",
"month", "within_period"),
"30-day" = c("assessment_id", "statistic", "within_period"),
stop(
".vec must be one of the following: 'all', 'date',
'single', 'month', or '30-day'"
)
)
}
#' Prepare reported chemistry values
#' Groups the data according the the block specified in the water quality
#' standards (WQS) table. Splits the data by the statistics column in the
#' WQS's table. Aregates the split data by group and applies the specified
#' statistic.
#' @param .data a data frame
#' @param .block_col aa character string representing the name of a column in
#' \code{.data} that specifies how to block/group the data.
#' @param .value_col a character string representing the name of a column in
#' \code{.data} that represents the reported parameter value.
#' @param .statistic_col a character string representing the name of a column
#' in \code{.data} that specifies the statistical function to be applied to
#' the values in \code{.value_col} based on the groups defined by
#' \code{.block_col}.
#' @param .new_value_col a character string representing the name of
#' aggregated value.
#' @return A data frame.
#' @examples
#' @export
# .data <- chem.df
# .block_col <- "block"
# .value_col <- "result_numeric"
# .statistic_col <- "statistic"
prep_values <- function(.data, .block_col, .value_col, .statistic_col,
.min_n_col,
.new_value_col) {
group_cols.list <- lapply(.data[[.block_col]], block_switch)
# Row number is added as a column because it will be the grouping
# variable specified when the blocking feature is specified as "single."
.data$row_number <- rownames(.data)
.data$group <- group_id(.data = .data,
.keep = group_cols.list,
.collapse = ":")
min_check.df <- min_sample_check(.data = .data,
.group_col = "group",
.min_n_col = .min_n_col,
.new_col = "stat_min_n")
by.list <- by(min_check.df,
min_check.df$stat_min_n,
FUN = function(i){
if (unique(i$stat_min_n) == TRUE) {
split_eval_stat(.data = i,
.group_col = "group",
.value_col = .value_col,
.statistic_col = .statistic_col,
.block_col = .block_col,
.new_value_col = .new_value_col)
} else if (unique(i$stat_min_n) == FALSE) {
i[.new_value_col] <- NA_real_
return(i)
} else {
stop("state_min_n must be TRUE or FALSE.")
}
})
final.df <- do.call(rbind, by.list)
return(final.df)
}
#' Prepare reported chemistry values
#' Groups the data according the the block specified in the water quality
#' standards (WQS) table. Splits the data by the statistics column in the
#' WQS's table. Aregates the split data by group and applies the specified
#' statistic.
#' @param .data a data frame
#' @param .block_col aa character string representing the name of a column in
#' \code{.data} that specifies how to block/group the data.
#' @param .value_col a character string representing the name of a column in
#' \code{.data} that represents the reported parameter value.
#' @param .group_col a character string representing the name of the column
#' in .data containing a value to group .data.
#' @param .statistic_col a character string representing the name of a column
#' in \code{.data} that specifies the statistical function to be applied to
#' the values in \code{.value_col} based on the groups defined by
#' \code{.block_col}.
#' @param .new_value_col a character string representing the name of
#' aggregated value.
#' @return A data frame.
#' @examples
#' @export
split_eval_stat <- function(.data, .block_col, .value_col, .statistic_col,
.group_col,
.new_value_col){
value.list <- by(data = .data,
INDICES = .data[.statistic_col],
FUN = function(i) {
agg_eval_stat(.data = i,
.group_col = .group_col,
.value_col = .value_col,
.statistic_col = .statistic_col,
.block_col = .block_col)
})
value.df <- do.call(rbind, value.list)
names(value.df)[names(value.df) == .value_col] <- .new_value_col
regroup.df <- merge(x = value.df,
y = .data,
by = c("group", .block_col, .statistic_col))
return(regroup.df)
}
#' A function to handle rolling window calculations.
#' This function was developed to prepare total coliform values for assessment.
#'
#' @param .group a vector representing group identifier.
#' @param .date a date vector representing collection dates.
#' @param .value a numeric vector of parameter values.
#' @param .rolling_days a single numeric value indicating the number of days
#' that should be used to create the rolling window.
#' @param .min_n a single numeric value representing the minimum number of
#' samples required.
#' @param .fun a function to summarize the \code{.value} within the
#' \code{.rolling_days} window.
#' @return a data frame.
rolling <- function(.group, .date, .value, .rolling_days, .min_n, .fun) {
# Check that the function exits -------------------------------------------
if (!exists(.fun)) stop(paste(".fun not recognized. You supplied:", .fun))
# Check minimum number of values present ----------------------------------
if (length(.value[!is.na(.value)]) < .min_n) {
return(
rolling_early(.group = .group, .date = .date, .value = .value)
)
}
# Check is numeric -------------------------------------------------
if (!is.numeric(.value)) stop(paste(".value must be numeric.",
"You supplied:", class(.value)))
if (!is.numeric(.min_n)) stop(paste(".min_n must be numeric.",
"You supplied:", class(.min_n)))
if (!is.numeric(.rolling_days)) stop(paste(".rolling_days must be numeric.",
"You supplied:", class(.rolling_days)))
# Check length ------------------------------------------------------------
if (length(.min_n) != 1) stop(paste(".min_n must be length 1.",
"You supplied an object of length:",
length(.min_n)))
if (length(.rolling_days) != 1) stop(paste(".rolling_days must be length 1.",
"You supplied an object of length:",
length(.rolling_days)))
# Check .date is class date -----------------------------------------------
if (!is_date(.date = .date))
stop(paste(".date must be class date.",
"You supplied class:",
class(.date)))
# Create data frame from vectors ------------------------------------------
new.df <- data.frame(group = .group,
date = .date,
value = .value,
stringsAsFactors = FALSE)
# Create data frame with each row representing the range of dates --------
date.df <- data.frame(date = seq.Date(min(.date),
max(.date),
by = "1 day"))
# Merge the data frames to complete the dates associated with .val --------
merged.df <- merge(date.df, new.df, by = "date", all.x = TRUE)
# Add row-number to reference during subsetting ---------------------------
merged.df$row_num <- rownames(merged.df)
# Create a list of non-NA rows within the specified window ----------------
row_num.list <- lapply(seq_along(merged.df$date), function(i) {
sub.df <- merged.df[i:(i + .rolling_days), ]
sub.df[!is.na(sub.df["value"]), "row_num"]
})
# Exclude elements without the minimum number of required samples ---------
min_req.list <- row_num.list[vapply(row_num.list, function(i) {
length(i) > .min_n
}, NA) ]
# Return early if length zero
if (length(min_req.list) == 0) return(rolling_early(.group = .group,
.date = .date,
.value = .value))
# Keep only unique combinations of rows -----------------------------------
unique.list <- unique(min_req.list)
# Return early if length zero
if (length(unique.list) == 0) return(rolling_early(.group = .group,
.date = .date,
.value = .value))
# Drop subsets of other elements ------------------------------------------
drop_subsets.list <- lapply(unique.list, function(i) {
check.list <- lapply(X = unique.list[!unique.list %in% list(i)],
FUN = function(j) {
i %in% j
} )
if (any(vapply(X = check.list,
FUN = all,
FUN.VALUE = NA))) {
return(NA)
} else {
return(i)
}
})
drop_subsets.list <- drop_subsets.list[!is.na(drop_subsets.list)]
# Return early if length zero
if (length(drop_subsets.list) == 0) return(rolling_early(.group = .group,
.date = .date,
.value = .value))
# For each window, nest the dates and values and find the mean ------------
final.list <- lapply(drop_subsets.list, function(i) {
sub.i <- merged.df[merged.df$row_num %in% i, ]
final.df <- data.frame(group = unique(sub.i$group),
dates = I(list(as.character(sub.i$date))),
values = I(list(sub.i$value)),
stringsAsFactors = FALSE)
final.df$value <- do.call(.fun, list(sub.i$value, na.rm = TRUE))
return(final.df)
})
# Append the lists of data frames together --------------------------------
final.df <- do.call(rbind, final.list)
return(final.df)
}
# helpers -----------------------------------------------------------------
#' Standardized output when rolling() must return early.
#'
#' @param .group a vector representing group identifier.
#' @param .date a date vector representing collection dates.
#' @param .value a numeric vector of parameter values.
#' @return a data frame
rolling_early <- function(.group, .date, .value) {
data.frame(group = unique(.group),
dates = I(list(as.character(.date))),
values = I(list(.value)),
value = NA_real_,
stringsAsFactors = FALSE)
}
#' Aggregate and evaluate statistic
#'
#' @param .data a data frame.
#' @param .value_col a character string representing the name of the column
#' in .data containing observed parameter values.
#' @param .group_col a character string representing the name of the column
#' in .data containing a value to group .data.
#' @param .statistic a character string representing the name of the column
#' in .data containing the name of a statistical function.
#' @param .block_col a character string representing the name of the column
#' in .data containing a string used to block (aggregate) the data.
#' @return a data frame.
agg_eval_stat <- function(.data, .value_col, .group_col,
.statistic_col, .block_col) {
with(.data,
aggregate(as.formula(paste(.value_col,
"~",
.group_col,
"+",
.statistic_col,
"+",
.block_col)),
FUN = parse_eval(.data[[.statistic_col]]),
na.action = "na.pass"))
}
#' Check that the minimum number of samples were collected to calculate a
#' given statistic.
#'
#' @param .data a data frame.
#' @param .group_col a character string representing the name of the column
#' in .data containing a value to group .data.
#' @param .min_n_col a character string representing the name of the column
#' in .data containing minimum number of samples required for compare against
#' a water quality standard.
#' @param .result_col a character string representing the name of the column
#' in .data containing observed/summarized parameter values.
#' @return a data frame.
min_sample_check <- function(.data, .group_col, .min_n_col, .new_col) {
freq.df <- as.data.frame(table(.data[.group_col]),
stringsAsFactors = FALSE)
names(freq.df) <- c(.group_col, "sample_count")
final.df <- merge(x = freq.df,
y = .data,
by = .group_col)
# final.df[.result_col] <- ifelse(final.df$sample_count < final.df[.min_n_col],
# NA_real_,
# final.df[[.result_col]])
final.df[.new_col] <- final.df$sample_count >= final.df[.min_n_col]
return(final.df)
}
#' No statistic specified; return orignal state
#'
#' @param .x an R object
#' @return returns the R object supplied. The Water Quality Standards
#' (WQS) table includes a column that specifies the function to be used
#' to aggregate data. To avoid implicit NA's when no statistic is required,
#' an explicit "none" was supplied. This function simplifies the process of
#' prep_values() because it can be supplied to the aggregate function in the
#' same fashion as functions such as min or mean.
none <- function(.x) {
return(.x)
}
#' Geometric Mean
#'
#' @param .x a numeric vector of reported values.
#' @return a numeric vector
geomean <- function(.x, na.rm = TRUE) {
if (na.rm == TRUE) .x <- .x[!is.na(.x)] # Drop NAs
prod(.x) ^ (1 / length(.x))
# exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}
#' Percentage of samples below a threshold.
#' Designed for total coliform water quality standards.
#' @param .x a numeric vector of reported values.
#' @param .thresh a single numeric value representing a threshold.
#' @param na.rm a logical value indicating if NA values should be
#' removed (TRUE) or keep NA values (FALSE).
#' @return a numeric vector
pct_below <- function(.x, .thresh, na.rm = TRUE) {
if (na.rm == TRUE) .x <- .x[!is.na(.x)] # Drop NAs
sum(.x <= .thresh) / length(.x) * 100
}
#' Percentage of samples below a 240
#' Designed for total coliform water quality standards.
#' @param .x a numeric vector of reported values.
#' @param na.rm a logical value indicating if NA values should be
#' removed (TRUE) or keep NA values (FALSE).
#' @return a numeric vector
pct_below_240 <- function(.x, na.rm = TRUE) {
pct_below(.x = .x, .thresh = 240, na.rm = na.rm)
}
#' Percentage of samples below a 5,000
#' Designed for total coliform water quality standards.
#' @param .x a numeric vector of reported values.
#' @param na.rm a logical value indicating if NA values should be
#' removed (TRUE) or keep NA values (FALSE).
#' @return a numeric vector
pct_below_5000 <- function(.x, na.rm = TRUE) {
pct_below(.x = .x, .thresh = 5000, na.rm = na.rm)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.