Introduction

This documentation describes how water chemistry data collected by the New York State Department of Environmental Conservation's (NYSDEC's) Stream Monitoring and Assessment Section (SMAS) is prepared for assessment using the stayCALM R-package. Much of this work will not be necessary once the database is finalized. There will still be some amount of data preparation, but it should not be as extensive.

Preprocessing

Prepare the R Environment

Load the necessary R packages into the global environment.

library(tidyverse)
library(readxl)
library(stayCALM)

Import the water quality standards (WQS) table from the stayCALM package.

data(nysdec_wqs)
standard_dt <- function(x) {
    DT::datatable(x, 
    options = list(
      columnDefs = list(list(
        className = 'dt-center'
      )),
      scrollY = 300,
      scroller = TRUE,
      scrollX = TRUE
    )
  )
}

Load SMAS Chemistry Data

EQuIS Data

The code below defines the file path to the SMAS chemistry data provided by G. Lemley.

smas.path <- file.path(here::here(),
                       "data-raw",
                       "smas",
                       "2018-2019",
                       "qa")

Import SMAS sample information and result data.

sample.df <- vroom::vroom(file.path(smas.path,
                                    "2020_05_01_S_Site.csv")) %>% 
  rename_all(tolower)  %>% 
  select(-site_id) %>% 
  rename(site_id = site_history_id)

result.df <- vroom::vroom(file.path(smas.path,
                                    "SMAS_chem_2018-2018_flagged_simple_2020-05-13.csv")) %>% 
  rename_all(tolower) %>% 
  mutate(validator_qualifiers = as.character(validator_qualifiers))

insitu.df <- vroom::vroom(file.path(smas.path,
                                    "S_IN_SITU_WATER_CHEM_2020_05_04.csv")) %>% 
  rename_all(tolower)
prep_insitu_1.df <- insitu.df %>% 
  select(event_smas_id,
         chemical_name = parameter_name,
         result_value = result,
         result_unit = unit) %>% 
  distinct() %>% 
  separate(event_smas_id, c("site_id", "date"), sep = "_(?=[^_]+$)") %>% 
  mutate(date = as.Date(date, "%Y%m%d")) %>% 
  filter(date >= as.Date("2018-01-01"))

seg_id.df <- unique(subset(sample.df, select = c("site_id", "site_pwl_id")))

prep_insitu.df <- left_join(prep_insitu_1.df, seg_id.df, "site_id") %>% 
  mutate_if(is.character, tolower)

# result.df <- result.df %>%    
#   mutate(sample_date = if_else(nchar(sample_date) > 10,
#                                 lubridate::mdy_hms(sample_date,
#                                                    quiet = TRUE,
#                                                    tz = "America/New_York"),
#                                 as.POSIXct(lubridate::mdy(sample_date,
#                                                           quiet = TRUE,
#                                                           tz = "America/New_York"))),
#          date = as.Date(sample_date)) 
# 
# check_join.df <- anti_join(result.df, prep_insitu.df, by = c("site_id", "date"))

Join the EQuIS sample information and result data by "SYS_SAMPLE_CODE".

test <- anti_join(result.df, sample.df, by = "site_id")
streams_chem.df <- left_join(result.df, sample.df, 
                              by = c("site_id")) 

Check streams_chem.df for duplicate values.

dup_test.df <- streams_chem.df %>% 
  group_by(sys_sample_code, sample_date, chemical_name, fraction, result_value) %>% 
  summarize(n = n()) %>% 
  filter(n > 1)

if (nrow(dup_test.df) > 0) stop("Check `streams_chem.df` for duplicates.")

Data Wrangling

Rename the reported chemical_name strings in streams_chem.df to match the parameter strings in the WQS table (nysdec_wqs).

clean_up.df <- streams_chem.df %>%
  mutate(chemical_name = case_when(
    chemical_name %in% "chloride (as cl)" ~ "chloride",
    chemical_name %in% "coliform" ~ "total_coliform",
    chemical_name %in% "fecal coliform" ~ "fecal_coliforms",
    chemical_name %in% "fluoride, undistilled" ~ "fluoride",
    chemical_name %in% "hardness (as caco3)" ~ "hardness",
    chemical_name %in% "nitrate+nitrite as nitrogen" ~ "nitrate_nitrite",
    chemical_name %in% "nitrogen, ammonia (as n)" ~ "ammonia",
    chemical_name %in% "nitrogen, nitrate (as n)" ~ "nitrate",
    chemical_name %in% "nitrogen, nitrite" ~ "nitrite",
    chemical_name %in% "phosphorus, total (as p)" ~ "phosphorus",
    chemical_name %in% "sulfate (as so4)" ~ "sulfate",
    chemical_name %in% "total dissolved solids (residue, filterable)" ~ "total_dissolved_solids",
    TRUE ~ chemical_name
  ))

Clean up sample dates and create a date column without the sample time. Sample time is not necessary for assessment purposes.

clean_up.df <- clean_up.df %>% 
   mutate(sample_date = if_else(nchar(sample_date) > 10,
                                lubridate::mdy_hms(sample_date,
                                                   quiet = TRUE,
                                                   tz = "America/New_York"),
                                as.POSIXct(lubridate::mdy(sample_date,
                                                          quiet = TRUE,
                                                          tz = "America/New_York"))),
         date = as.Date(sample_date)) 

Append the in situ data to the chemical data.

clean_up.df <- bind_rows(clean_up.df, prep_insitu.df)

Duplicates

Confirm that there are no duplicate chemicals reported for the same sample.

dups.df <- clean_up.df %>%
  group_by_at(vars(-result_value, -result_unit)) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  filter(n > 1) %>%
  select(n, everything()) %>%
  arrange(chemical_name, sys_sample_code)

if (nrow(dups.df) > 0 ) {
  vroom::vroom_write(dups.df, file.path(here::here(), "data-raw", "dups.csv"), delim = ",")
  stop("Check dups.df for duplicate values.")
}

Final Formatting

Rename multiple columns to the standard naming convention required by the stayCALM package.

smas.df <- clean_up.df %>%
  rename(seg_id = site_pwl_id,
         units = result_unit,
         parameter = chemical_name,
         data_provider = project_name)

Create a sample_id column representing the concetenation of site_id and date to enable aggregation by individual samples.

smas.df$sample_id <- paste(smas.df$site_id, smas.df$date, sep = "_")

Reformat the reported chemical fraction, which should be "total" or "dissolved."

smas.df <- smas.df %>% 
  mutate(fraction = case_when(
           fraction %in% "T" ~ "total",
           fraction %in% "D" ~ "dissolved",
           is.na(fraction) & parameter %in% c("ph",
                                              "total_coliforms",
                                              "fecal_coliforms") ~ "total",
           parameter %in% c("dissolved_oxygen",
                                              "total_dissolved_solids") ~ "dissolved",
           TRUE ~ "ERROR"
         ))

Create a value column that represents reported result_value when available and the quantitation_limit when the parameter is reported as undetected. Using the quantitation_limit provides a numeric value that can be compared against a WQS. Although the quantitation_limit may not represent an accurate value for a given sample, it is accurate enough to compare against a WQS; there should be no WQSs below quantitation_limit, and therefore all undetected parameters should be attaining WQSs.

smas.df$value <- ifelse(smas.df$validator_qualifiers %in% "U",
                        as.numeric(smas.df$quantitation_limit),
                        smas.df$result_value)

Convert all character values in smas.df to lowercase. This makes it easier to work with the data and prevents miss-matching due to case-sensitivity.

smas.df <- mutate_if(smas.df, is.character, tolower)

Ensure that the parameter units in smas.df match the units of the WQS in nysdec_wqs.

keep.vec <- c("parameter", "fraction", "units")

merged.df <- merge(unique(smas.df[keep.vec]),
                   unique(nysdec_wqs[keep.vec]),
                   by = c("parameter", "fraction"))

merged.df$units_comp <- paste(merged.df$units.x, merged.df$units.y, sep = ":")

names(merged.df)[names(merged.df) %in% "units.x"] <- "units"

units_comp.df <- merge(smas.df, merged.df, by = keep.vec,
                       all.x = TRUE)

units_comp.df$units_comp <- ifelse(is.na(units_comp.df$units_comp),
                                   "no_match",
                                   units_comp.df$units_comp)

split.list <- by(units_comp.df, units_comp.df$units_comp,
                 function(i) {
                   units_comp.scalar <- unique(i$units_comp)
                   if (units_comp.scalar %in% "mg/l:ug/l") {
                     i$value <- i$value * 1000
                     i$units <- "ug/l"
                   }
                   if (units_comp.scalar %in% "ug/l:mg/l") {
                     i$value <- i$value / 1000
                     i$units <- "mg/l"
                   }
                   if (units_comp.scalar %in% "ng/l:ug/l") {
                     i$value <- i$value / 1000
                     i$units <- "ug/l"
                   }
                   if (units_comp.scalar %in% "ppt:ug/l") {
                     i$units <- "ug/l"
                   }

                   if (units_comp.scalar %in% "ph units:ph_units") {
                     i$units <- "ph_units"
                   }
                   if (units_comp.scalar %in% "NA:ug/l") {
                     i$units <- "ug/l"
                   }
                   if (units_comp.scalar %in% "NA:cfu/100ml") {
                     i$units <- "cfu/100ml"
                   }



                   if (!units_comp.scalar %in% c("mg/l:ug/l",
                                                 "ug/l:mg/l",
                                                 "ng/l:ug/l",
                                                 "ppt:ug/l",
                                                 "ph units:ph_units")) {
                     break.vec <- unlist(strsplit(units_comp.scalar, ":"))
                     if (length(unique(break.vec)) != 1) {
                       warning(paste0("Review required...",
                                      "\n",
                                     "\t Supplied: ", break.vec[1],
                                     "\n",
                                     "\t Required: ", break.vec[2],
                                     "\n"))
                     }
                   }

                  return(i)

                 })

prepped.df <- do.call(rbind, split.list)

Identify all of the parameters in nysdec_wqs that were not matched in smas.df. There will generally be a list of several parameter names that SMAS does not collect. This table should be manually compared against the observed values to ensure that the sample was truely not collected and not just mislabeled.

no_wqs_match.df <- prepped.df %>% 
  select(parameter, everything()) %>% 
  anti_join(stayCALM::nysdec_wqs, ., by = c("parameter", "fraction", "units")) %>% 
  select(parameter, fraction, units) %>% 
  distinct() %>% 
  arrange()

params.df <- prepped.df  %>% 
  select(parameter, fraction, units) %>% 
  distinct() %>% 
  arrange()

Add placeholder columns. depth is needed to be consistent with LMAS data; depth is not recorded by SMAS, and therefore this column is filled with NAs.

prepped.df$depth <- NA_real_

Subset the prepped.df to retain only the columns of interest.

keep.vec <- c("seg_id",
              "site_id",
              "sample_id", 
              "depth", # Adding depth to match LMAS columns
              "date",
              "fraction", "parameter", "value", "units",
              "quantitation_limit",
              "validator_qualifiers",
              "interpreted_qualifiers",
              "data_provider"
)

# keep.vec[!keep.vec %in% names(prepped.df)]
final_smas.df <- subset(prepped.df, select = keep.vec)

Remove rows that have been flagged as "r" in the validator_qualifiers column indicating that the values did not meet QA standards and should be removed ("r").

final_smas.df <- final_smas.df[!final_smas.df$validator_qualifiers %in% "r", ]
count.df <- final_smas.df %>% 
  group_by_all() %>% 
  mutate(n = n())
final_smas.df2 <- unique(final_smas.df)

Export SMAS Data

With the usethis package, the SMAS chemistry data is exported as a .rda file making it easily accessible during the development and testing of the stayCALM package.

smas.df <- final_smas.df
usethis::use_data(smas.df, overwrite = TRUE)


BWAM/stayCALM documentation built on May 21, 2020, 3:24 p.m.