R/prep_values.R

Defines functions pct_below_5000 pct_below_240 pct_below geomean none min_sample_check agg_eval_stat rolling_early rolling split_eval_stat prep_values block_switch

Documented in agg_eval_stat block_switch geomean min_sample_check none pct_below pct_below_240 pct_below_5000 prep_values rolling rolling_early split_eval_stat

#' Identify the columns to group by based on the specified block
#'
#' @param .vec a character vector representing: 1) "all", 2) "date",
#'  3) "single, 4) "month, or 5) "30-day.
#' @return A vector of column names to be selected for grouping

block_switch <- function(.vec) {
  # Includes within period to insure downstream that samples outside of the
  # sampling period do not contribute to the assessment of the samples within
  # the sampling period.
  switch(
    EXPR = .vec,
    "single" = "row_number",
    "all" = c("assessment_id", "statistic", "within_period"),
    "date" = c("assessment_id", "statistic", "date", "within_period"),
    "month" = c("assessment_id", "statistic", "year",
                "month", "within_period"),
    "30-day" = c("assessment_id", "statistic", "within_period"),
    stop(
      ".vec must be one of the following: 'all', 'date',
              'single', 'month', or '30-day'"
    )
  )
}



#' Prepare reported chemistry values
#' Groups the data according the the block specified in the water quality
#' standards (WQS) table. Splits the data by the statistics column in the
#' WQS's table. Aregates the split data by group and applies the specified
#' statistic.
#' @param .data a data frame
#' @param .block_col aa character string representing the name of a column in
#'  \code{.data} that specifies how to block/group the data.
#' @param .value_col a character string representing the name of a column in
#' \code{.data} that represents the reported parameter value.
#' @param .statistic_col a character string representing the name of a column
#' in \code{.data} that specifies the statistical function to be applied to
#' the values in \code{.value_col} based on the groups defined by
#' \code{.block_col}.
#' @param .new_value_col a character string representing the name of
#' aggregated value.
#' @return A data frame.
#' @examples
#' @export
# .data <- chem.df
# .block_col <- "block"
# .value_col <- "result_numeric"
# .statistic_col <- "statistic"

prep_values <- function(.data, .block_col, .value_col, .statistic_col,
                        .min_n_col,
                        .new_value_col) {

  group_cols.list <- lapply(.data[[.block_col]], block_switch)

  # Row number is added as a column because it will be the grouping
  # variable specified when the blocking feature is specified as "single."
  .data$row_number <- rownames(.data)

  .data$group <- group_id(.data = .data,
                          .keep = group_cols.list,
                          .collapse = ":")

  min_check.df <- min_sample_check(.data = .data,
                               .group_col = "group",
                               .min_n_col = .min_n_col,
                               .new_col = "stat_min_n")

  by.list <- by(min_check.df,
                min_check.df$stat_min_n,
                FUN = function(i){
                  if (unique(i$stat_min_n) == TRUE) {
                    split_eval_stat(.data = i,
                                    .group_col = "group",
                                    .value_col = .value_col,
                                    .statistic_col = .statistic_col,
                                    .block_col = .block_col,
                                    .new_value_col = .new_value_col)
                  } else if (unique(i$stat_min_n) == FALSE) {
                    i[.new_value_col] <- NA_real_
                    return(i)
                  } else {
                    stop("state_min_n must be TRUE or FALSE.")
                  }
                })


  final.df <- do.call(rbind, by.list)

  return(final.df)
}


#' Prepare reported chemistry values
#' Groups the data according the the block specified in the water quality
#' standards (WQS) table. Splits the data by the statistics column in the
#' WQS's table. Aregates the split data by group and applies the specified
#' statistic.
#' @param .data a data frame
#' @param .block_col aa character string representing the name of a column in
#'  \code{.data} that specifies how to block/group the data.
#' @param .value_col a character string representing the name of a column in
#' \code{.data} that represents the reported parameter value.
#' @param .group_col a character string representing the name of the column
#' in .data containing a value to group .data.
#' @param .statistic_col a character string representing the name of a column
#' in \code{.data} that specifies the statistical function to be applied to
#' the values in \code{.value_col} based on the groups defined by
#' \code{.block_col}.
#' @param .new_value_col a character string representing the name of
#' aggregated value.
#' @return A data frame.
#' @examples
#' @export

split_eval_stat <- function(.data, .block_col, .value_col, .statistic_col,
                            .group_col,
                            .new_value_col){
  value.list <- by(data = .data,
                   INDICES = .data[.statistic_col],
                   FUN = function(i) {
                     agg_eval_stat(.data = i,
                                   .group_col = .group_col,
                                   .value_col = .value_col,
                                   .statistic_col = .statistic_col,
                                   .block_col = .block_col)

                   })

  value.df <- do.call(rbind, value.list)
  names(value.df)[names(value.df) == .value_col] <- .new_value_col

  regroup.df <- merge(x = value.df,
                      y = .data,
                      by = c("group", .block_col, .statistic_col))

  return(regroup.df)
}

#' A function to handle rolling window calculations.
#' This function was developed to prepare total coliform values for assessment.
#'
#' @param .group a vector representing group identifier.
#' @param .date a date vector representing collection dates.
#' @param .value a numeric vector of parameter values.
#' @param .rolling_days a single numeric value indicating the number of days
#' that should be used to create the rolling window.
#' @param .min_n a single numeric value representing the minimum number of
#' samples required.
#' @param .fun a function to summarize the \code{.value} within the
#' \code{.rolling_days} window.
#' @return a data frame.

rolling <- function(.group, .date, .value, .rolling_days, .min_n, .fun) {
  # Check that the function exits -------------------------------------------
  if (!exists(.fun)) stop(paste(".fun not recognized. You supplied:", .fun))

  # Check minimum number of values present ----------------------------------
  if (length(.value[!is.na(.value)]) < .min_n) {
    return(
      rolling_early(.group = .group, .date = .date, .value = .value)
      )
  }

  # Check is numeric -------------------------------------------------
  if (!is.numeric(.value)) stop(paste(".value must be numeric.",
                                     "You supplied:", class(.value)))
  if (!is.numeric(.min_n)) stop(paste(".min_n must be numeric.",
                                     "You supplied:", class(.min_n)))
  if (!is.numeric(.rolling_days)) stop(paste(".rolling_days must be numeric.",
                                      "You supplied:", class(.rolling_days)))

  # Check length ------------------------------------------------------------
  if (length(.min_n) != 1) stop(paste(".min_n must be length 1.",
                                      "You supplied an object of length:",
                                      length(.min_n)))
  if (length(.rolling_days) != 1) stop(paste(".rolling_days must be length 1.",
                                      "You supplied an object of length:",
                                      length(.rolling_days)))

  # Check .date is class date -----------------------------------------------
  if (!is_date(.date = .date))
    stop(paste(".date must be class date.",
               "You supplied class:",
               class(.date)))

  # Create data frame from vectors ------------------------------------------
  new.df <- data.frame(group = .group,
                       date = .date,
                       value = .value,
                       stringsAsFactors = FALSE)

  # Create data frame with each row representing the range of dates  --------
  date.df <- data.frame(date = seq.Date(min(.date),
                                        max(.date),
                                        by = "1 day"))

  # Merge the data frames to complete the dates associated with .val --------
  merged.df <- merge(date.df, new.df, by = "date", all.x = TRUE)

  # Add row-number to reference during subsetting ---------------------------
  merged.df$row_num <- rownames(merged.df)


  # Create a list of non-NA rows within the specified window ----------------
  row_num.list <- lapply(seq_along(merged.df$date), function(i) {
    sub.df <- merged.df[i:(i + .rolling_days), ]
    sub.df[!is.na(sub.df["value"]), "row_num"]
  })

  # Exclude elements without the minimum number of required samples ---------
  min_req.list <- row_num.list[vapply(row_num.list, function(i) {
    length(i) > .min_n
  }, NA) ]
  # Return early if length zero
  if (length(min_req.list) == 0) return(rolling_early(.group = .group,
                                                      .date = .date,
                                                      .value = .value))
  # Keep only unique combinations of rows -----------------------------------
  unique.list <- unique(min_req.list)
  # Return early if length zero
  if (length(unique.list) == 0) return(rolling_early(.group = .group,
                                                     .date = .date,
                                                     .value = .value))
  # Drop subsets of other elements ------------------------------------------

  drop_subsets.list <- lapply(unique.list, function(i) {

    check.list <- lapply(X = unique.list[!unique.list %in% list(i)],
                         FUN = function(j) {
                           i %in% j
                         } )
    if (any(vapply(X = check.list,
                   FUN = all,
                   FUN.VALUE = NA))) {
      return(NA)
    } else {
      return(i)
    }
  })

  drop_subsets.list <- drop_subsets.list[!is.na(drop_subsets.list)]

  # Return early if length zero
  if (length(drop_subsets.list) == 0) return(rolling_early(.group = .group,
                                                           .date = .date,
                                                           .value = .value))

  # For each window, nest the dates and values and find the mean ------------
  final.list <- lapply(drop_subsets.list, function(i) {
    sub.i <- merged.df[merged.df$row_num %in% i, ]
    final.df <- data.frame(group = unique(sub.i$group),
                           dates = I(list(as.character(sub.i$date))),
                           values = I(list(sub.i$value)),
                           stringsAsFactors = FALSE)
    final.df$value <- do.call(.fun, list(sub.i$value, na.rm = TRUE))
    return(final.df)
  })

  # Append the lists of data frames together --------------------------------
  final.df <- do.call(rbind, final.list)

  return(final.df)
}

# helpers -----------------------------------------------------------------
#' Standardized output when rolling() must return early.
#'
#' @param .group a vector representing group identifier.
#' @param .date a date vector representing collection dates.
#' @param .value a numeric vector of parameter values.
#' @return a data frame

rolling_early <- function(.group, .date, .value) {
  data.frame(group = unique(.group),
             dates = I(list(as.character(.date))),
             values = I(list(.value)),
             value = NA_real_,
             stringsAsFactors = FALSE)
}

#' Aggregate and evaluate statistic
#'
#' @param  .data a data frame.
#' @param .value_col a character string representing the name of the column
#' in .data containing observed parameter values.
#' @param .group_col a character string representing the name of the column
#' in .data containing a value to group .data.
#' @param .statistic a character string representing the name of the column
#' in .data containing the name of a statistical function.
#' @param .block_col a character string representing the name of the column
#' in .data containing a string used to block (aggregate) the data.
#' @return a data frame.

agg_eval_stat <- function(.data, .value_col, .group_col,
                          .statistic_col, .block_col) {
  with(.data,
       aggregate(as.formula(paste(.value_col,
                                  "~",
                                  .group_col,
                                  "+",
                                  .statistic_col,
                                  "+",
                                  .block_col)),
                 FUN = parse_eval(.data[[.statistic_col]]),
                 na.action = "na.pass"))
}

#' Check that the minimum number of samples were collected to calculate a
#' given statistic.
#'
#' @param  .data a data frame.
#' @param .group_col a character string representing the name of the column
#' in .data containing a value to group .data.
#' @param .min_n_col a character string representing the name of the column
#' in .data containing minimum number of samples required for compare against
#' a water quality standard.
#' @param .result_col a character string representing the name of the column
#' in .data containing observed/summarized parameter values.
#' @return a data frame.

min_sample_check <- function(.data, .group_col, .min_n_col, .new_col) {
  freq.df <- as.data.frame(table(.data[.group_col]),
                           stringsAsFactors = FALSE)
  names(freq.df) <- c(.group_col, "sample_count")

  final.df <- merge(x = freq.df,
                    y = .data,
                    by = .group_col)

  # final.df[.result_col] <- ifelse(final.df$sample_count < final.df[.min_n_col],
  #                                 NA_real_,
  #                                 final.df[[.result_col]])

  final.df[.new_col] <- final.df$sample_count >= final.df[.min_n_col]
  return(final.df)
}

#' No statistic specified; return orignal state
#'
#' @param .x an R object
#' @return returns the R object supplied. The Water Quality Standards
#' (WQS) table includes a column that specifies the function to be used
#' to aggregate data. To avoid implicit NA's when no statistic is required,
#' an explicit "none" was supplied. This function simplifies the process of
#' prep_values() because it can be supplied to the aggregate function in the
#' same fashion as functions such as min or mean.

none <- function(.x) {
  return(.x)
}

#' Geometric Mean
#'
#' @param .x a numeric vector of reported values.
#' @return a numeric vector

geomean <- function(.x, na.rm = TRUE) {
  if (na.rm == TRUE) .x <- .x[!is.na(.x)] # Drop NAs
  prod(.x) ^ (1 / length(.x))
  # exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}

#' Percentage of samples below a threshold.
#' Designed for total coliform water quality standards.
#' @param .x a numeric vector of reported values.
#' @param .thresh a single numeric value representing a threshold.
#' @param na.rm a logical value indicating if NA values should be
#' removed (TRUE) or keep NA values (FALSE).
#' @return a numeric vector

pct_below <- function(.x, .thresh, na.rm = TRUE) {
  if (na.rm == TRUE) .x <- .x[!is.na(.x)] # Drop NAs
  sum(.x <= .thresh) / length(.x) * 100
}

#' Percentage of samples below a 240
#' Designed for total coliform water quality standards.
#' @param .x a numeric vector of reported values.
#' @param na.rm a logical value indicating if NA values should be
#' removed (TRUE) or keep NA values (FALSE).
#' @return a numeric vector

pct_below_240 <- function(.x, na.rm = TRUE)  {
  pct_below(.x = .x, .thresh = 240, na.rm = na.rm)
}

#' Percentage of samples below a 5,000
#' Designed for total coliform water quality standards.
#' @param .x a numeric vector of reported values.
#' @param na.rm a logical value indicating if NA values should be
#' removed (TRUE) or keep NA values (FALSE).
#' @return a numeric vector

pct_below_5000 <- function(.x, na.rm = TRUE)  {
  pct_below(.x = .x, .thresh = 5000, na.rm = na.rm)
}
BWAM/stayCALM documentation built on May 21, 2020, 3:24 p.m.