Introduction

This documentation describes how water chemistry data collected by the New York State Department of Environmental Conservation's (NYSDEC's) Stream Monitoring and Assessment Section (SMAS) is prepared for assessment using the stayCALM R-package. Much of this work will not be necessary once the database is finalized. There will still be some amount of data preparation, but it should not be as extensive.

Preprocessing

Prepare the R Environment

Load the necessary R packages into the global environment.

library(tidyverse)
library(readxl)
data(nysdec_wqs)
standard_dt <- function(x) {
    DT::datatable(x, 
    options = list(
      columnDefs = list(list(
        className = 'dt-center'
      )),
      scrollY = 300,
      scroller = TRUE,
      scrollX = TRUE
    )
  )
}

Load SMAS Chemistry Data

EQuIS Data

The code below defines the file path to the SMAS chemistry data historically stored in an EQuIS database.

smas.path <- file.path(here::here(),
                       "data-raw",
                       "smas")

Import EQuIS sample information and result data.

# location.df <- vroom::vroom(file.path(equis.path, "equis_location_history.csv"))

sample.df <- vroom::vroom(file.path(smas.path,
                                    "equis_sample_history.csv")) %>% 
  select(SYS_SAMPLE_CODE, SAMPLE_DELIVERY_GROUP, SAMPLE_NAME,
         SAMPLE_TYPE_CODE, SAMPLE_DATE, DATA_PROVIDER)

result.df <- vroom::vroom(file.path(smas.path, "equis_results_history.csv")) %>% 
  select(SYS_SAMPLE_CODE, LAB_SDG, LAB_ANL_METHOD_NAME, CAS_RN, FRACTION, CHEMICAL_NAME,
         RESULT_NUMERIC, RESULT_UNIT,
         METHOD_DETECTION_LIMIT, REPORTING_DETECTION_LIMIT, VALIDATION_LEVEL,
         VALIDATOR_QUALIFIERS, INTERPRETED_QUALIFIERS, QUANTITATION_LIMIT,
         ANALYSIS_DATE)  %>% 
  mutate(VALIDATOR_QUALIFIERS = as.character(VALIDATOR_QUALIFIERS))

Join the EQuIS sample information and result data by "SYS_SAMPLE_CODE".

streams_chem.df <- inner_join(sample.df, result.df,
                              by = c("SYS_SAMPLE_CODE")#,
                                     # "SAMPLE_DELIVERY_GROUP" = "LAB_SDG")
                              ) %>% 
  mutate(ADJUST_RIBS_ID = str_remove_all(SYS_SAMPLE_CODE, "\\-.*|\\_.*")) 

test <- streams_chem.df %>% 
  group_by(SYS_SAMPLE_CODE, SAMPLE_DATE, CHEMICAL_NAME, FRACTION, RESULT_NUMERIC) %>% 
  summarize(n = n()) %>% 
  filter(n > 1)

Cross-Walk

To join the SMAS chemistry data with the Assessment database, each SMAS sample must have a Waterbody Inventory and Priority Waterbody Listing (WI/PWL) ID. SMAS has associated all sampling locations with a WI/PWL ID. However to be more descriptive, SMAS's sample naming convention changed recently, and therefore the WI/PWL ID cannot be directly linked to the sample IDs in the EQuIS database. The new sample names must be cross-walked with the EQuIS sample names to correctly assign the WI/PWL ID.

Import the data with the new sample IDs and the associated WI/PWL IDs.

streams_sites.df <- file.path(smas.path,
                              "cleaned",
                              "20191224_Site_Field_cleaned_final.xlsx") %>% 
  readxl::read_excel(sheet = "Sheet1") %>% 
  select(SH_SITE_ID, SH_PWL_ID) %>% 
  distinct()

Import the data that provides the necessary information to perform the cross-walk between the EQuIS database and the new sample IDs.

streams_crosswalk.df <- file.path(smas.path,
                                  "RIBSchem_historic_BioSiteJoin_2019-12-18.xlsx"
                                  )  %>% 
  readxl::read_excel(sheet = "Sheet1") %>% 
  select(RIBS_ID_FOR_SITE_TABLE:Station_RIBS, SBU_ID) %>% 
  distinct()

Perform the cross-walk to assign the WI/PWL ID to the EQuIS data.

crosswalk.df <- inner_join(streams_chem.df, streams_crosswalk.df,
                     by = "ADJUST_RIBS_ID") %>% 
  inner_join(streams_sites.df, by = c("SBU_ID" = "SH_SITE_ID"))

Validation

Confirm that the appropriate chemical name is assigned and that date columns are properly formatted.

clean_up.df <- crosswalk.df %>% 
  mutate(CHEMICAL_NAME = case_when(
    CHEMICAL_NAME %in% "Fecal Coliform" ~ "Fecal Coliforms",
    CHEMICAL_NAME %in% "Coliform" ~ "Total Coliform",
    CHEMICAL_NAME %in% "Nitrogen, Ammonia (As N)" ~ "Ammonia",
    CHEMICAL_NAME %in% "Nitrogen, Nitrate (As N)" ~ "Nitrate",
    CHEMICAL_NAME %in% "Chloride (As Cl)" ~ "Chloride",
    CHEMICAL_NAME %in% "Total Dissolved Solids (Residue, Filterable)" ~ " Total Dissolved Solids",
    TRUE ~ CHEMICAL_NAME
  )) %>% 
   mutate(SAMPLE_DATE = if_else(nchar(SAMPLE_DATE) > 10,
                                lubridate::mdy_hms(SAMPLE_DATE,
                                                   quiet = TRUE,
                                                   tz = "America/New_York"),
                                as.POSIXct(lubridate::mdy(SAMPLE_DATE,
                                                          quiet = TRUE,
                                                          tz = "America/New_York"))),
         ANALYSIS_DATE = if_else(nchar(ANALYSIS_DATE) > 10,
                                lubridate::mdy_hms(ANALYSIS_DATE,
                                                   quiet = TRUE,
                                                   tz = "America/New_York"),
                                as.POSIXct(lubridate::mdy(ANALYSIS_DATE,
                                                          quiet = TRUE,
                                                          tz = "America/New_York"))),
         date = as.Date(SAMPLE_DATE)) 

Drop the ANALYSIS_DATE column because it is creating duplicates.

stream_chem.df <- clean_up.df %>% 
  select(-ANALYSIS_DATE) %>% 
  distinct()

Duplicates

Confirm that there are no duplicate chemicals reported for the same sample. REQUIRES ADDITIONAL ATTENTION.

dups.df <- stream_chem.df %>%
  group_by_at(vars(-RESULT_NUMERIC, -RESULT_UNIT)) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  filter(n > 1) %>%
  select(n, everything()) %>%
  arrange(CHEMICAL_NAME, SYS_SAMPLE_CODE)

vroom::vroom_write(dups.df, file.path(here::here(), "data-raw", "dups.csv"), delim = ",")
#test <- inner_join(dups.df, stream_chem.df)
standard_dt(dups.df)

Final Formatting

smas.df <- stream_chem.df %>% 
  rename_all(tolower) %>% 
  rename(seg_id = sh_pwl_id,
         units = result_unit) %>% 
  mutate(parameter = chemical_name %>% tolower() %>% trimws() %>% gsub(" ", "_", .),
         parameter = case_when(
           parameter %in% "hardness_(as_caco3)" ~ "hardness",
           parameter %in% c("nitrogen,_nitrate-nitrite",
                            "nitrogen,_nitrite_+_nitrate") ~ "nitrate_nitrite",
           parameter %in% "sulfate_(as_so4)" ~ "sulfate",
           TRUE ~ parameter
         )) %>% 
  mutate(fraction = case_when(
           fraction %in% "T" ~ "total",
           fraction %in% "D" ~ "dissolved",
           is.na(fraction) & parameter %in% c("ph",
                                              "total_coliforms",
                                              "fecal_coliforms") ~ "total",
           is.na(fraction) & parameter %in% c("dissolved_oxygen",
                                              "total_dissolved_solids") ~ "dissolved",
           TRUE ~ "ERROR"
         ),
         value = if_else(is.na(result_numeric) & !is.na(validation_level),
                                  reporting_detection_limit,
                                  result_numeric))  %>% 
  mutate(sample_id = gsub(pattern = "_[^_]+$",
                              replacement = "",
                              x = sys_sample_code)) %>% 
  select(sample_id, everything())
smas.df <- mutate_if(smas.df, is.character, tolower)
smas.df$units <- with(smas.df, ifelse(parameter %in% "ammonia" & is.na(units),
                                "mg/l",
                                units))
keep.vec <- c("parameter", "fraction", "units")

merged.df <- merge(unique(smas.df[keep.vec]),
                   unique(nysdec_wqs[keep.vec]),
                   by = c("parameter", "fraction"))

merged.df$units_comp <- paste(merged.df$units.x, merged.df$units.y, sep = ":")

names(merged.df)[names(merged.df) %in% "units.x"] <- "units"

units_comp.df <- merge(smas.df, merged.df, by = keep.vec,
                       all.x = TRUE)

units_comp.df$units_comp <- ifelse(is.na(units_comp.df$units_comp),
                                   "no_match",
                                   units_comp.df$units_comp)

split.list <- by(units_comp.df, units_comp.df$units_comp,
                 function(i) {
                   units_comp.scalar <- unique(i$units_comp)
                   if (units_comp.scalar %in% "mg/l:ug/l") {
                     i$value <- i$value * 1000
                     i$units <- "ug/l"
                   }
                   if (units_comp.scalar %in% "ug/l:mg/l") {
                     i$value <- i$value / 1000
                     i$units <- "mg/l"
                   }
                   if (units_comp.scalar %in% "ng/l:ug/l") {
                     i$value <- i$value / 1000
                     i$units <- "ug/l"
                   }
                   if (units_comp.scalar %in% "ppt:ug/l") {
                     i$units <- "ug/l"
                   }

                   if (units_comp.scalar %in% "ph units:ph_units") {
                     i$units <- "ph_units"
                   }
                   if (units_comp.scalar %in% "NA:ug/l") {
                     i$units <- "ug/l"
                   }
                   if (units_comp.scalar %in% "NA:cfu/100ml") {
                     i$units <- "cfu/100ml"
                   }



                   if (!units_comp.scalar %in% c("mg/l:ug/l",
                                                 "ug/l:mg/l",
                                                 "ng/l:ug/l",
                                                 "ppt:ug/l",
                                                 "ph units:ph_units")) {
                     break.vec <- unlist(strsplit(units_comp.scalar, ":"))
                     if (length(unique(break.vec)) != 1) {
                       warning(paste0("Review required...",
                                      "\n",
                                     "\t Supplied: ", break.vec[1],
                                     "\n",
                                     "\t Required: ", break.vec[2],
                                     "\n"))
                     }
                   }

                  return(i)

                 })

prepped.df <- do.call(rbind, split.list)
no_wqs_match.df <- prepped.df %>% 
  select(parameter, everything()) %>% 
  anti_join(stayCALM::nysdec_wqs, ., by = c("parameter", "fraction", "units")) %>% 
  select(parameter, fraction, units) %>% 
  distinct()
# Adding depth to match LMAS columns
prepped.df$depth <- NA_real_
prepped.df$water_type <- "flow"
keep.vec <- c("seg_id",
              # "water_type",
              "sample_id", 
              "depth",
              "date",
              "fraction", "parameter", "value", "units",
              "quantitation_limit",
              "validator_qualifiers",
              "interpreted_qualifiers",
              "data_provider"
)

final_smas.df <- subset(prepped.df, select = keep.vec)
final_smas.df <- final_smas.df[!final_smas.df$interpreted_qualifiers %in% "r", ]
final_smas.df$value <- ifelse(final_smas.df$interpreted_qualifiers %in% "u",
                              as.numeric(final_smas.df$quantitation_limit),
                              final_smas.df$value)

Export SMAS Data

With the usethis package, the SMAS chemistry data is exported as a .rda file making it easily accessible during the development and testing of the stayCALM package.

smas.df <- final_smas.df
usethis::use_data(smas.df, overwrite = TRUE)


BWAM/stayCALM documentation built on May 21, 2020, 3:24 p.m.