This documentation describes how water chemistry data collected by the New York State Department of Environmental Conservation's (NYSDEC's) Stream Monitoring and Assessment Section (SMAS) is prepared for assessment using the stayCALM R-package. Much of this work will not be necessary once the database is finalized. There will still be some amount of data preparation, but it should not be as extensive.
Load the necessary R packages into the global environment.
library(tidyverse) library(readxl) library(stayCALM)
Import the water quality standards (WQS) table from the stayCALM package.
data(nysdec_wqs)
standard_dt <- function(x) { DT::datatable(x, options = list( columnDefs = list(list( className = 'dt-center' )), scrollY = 300, scroller = TRUE, scrollX = TRUE ) ) }
The code below defines the file path to the SMAS chemistry data provided by G. Lemley.
smas.path <- file.path(here::here(), "data-raw", "smas", "2018-2019", "qa")
Import SMAS sample information and result data.
sample.df <- vroom::vroom(file.path(smas.path, "2020_05_01_S_Site.csv")) %>% rename_all(tolower) %>% select(-site_id) %>% rename(site_id = site_history_id) result.df <- vroom::vroom(file.path(smas.path, "SMAS_chem_2018-2018_flagged_simple_2020-05-13.csv")) %>% rename_all(tolower) %>% mutate(validator_qualifiers = as.character(validator_qualifiers)) insitu.df <- vroom::vroom(file.path(smas.path, "S_IN_SITU_WATER_CHEM_2020_05_04.csv")) %>% rename_all(tolower)
prep_insitu_1.df <- insitu.df %>% select(event_smas_id, chemical_name = parameter_name, result_value = result, result_unit = unit) %>% distinct() %>% separate(event_smas_id, c("site_id", "date"), sep = "_(?=[^_]+$)") %>% mutate(date = as.Date(date, "%Y%m%d")) %>% filter(date >= as.Date("2018-01-01")) seg_id.df <- unique(subset(sample.df, select = c("site_id", "site_pwl_id"))) prep_insitu.df <- left_join(prep_insitu_1.df, seg_id.df, "site_id") %>% mutate_if(is.character, tolower) # result.df <- result.df %>% # mutate(sample_date = if_else(nchar(sample_date) > 10, # lubridate::mdy_hms(sample_date, # quiet = TRUE, # tz = "America/New_York"), # as.POSIXct(lubridate::mdy(sample_date, # quiet = TRUE, # tz = "America/New_York"))), # date = as.Date(sample_date)) # # check_join.df <- anti_join(result.df, prep_insitu.df, by = c("site_id", "date"))
Join the EQuIS sample information and result data by "SYS_SAMPLE_CODE".
test <- anti_join(result.df, sample.df, by = "site_id") streams_chem.df <- left_join(result.df, sample.df, by = c("site_id"))
Check streams_chem.df
for duplicate values.
dup_test.df <- streams_chem.df %>% group_by(sys_sample_code, sample_date, chemical_name, fraction, result_value) %>% summarize(n = n()) %>% filter(n > 1) if (nrow(dup_test.df) > 0) stop("Check `streams_chem.df` for duplicates.")
Rename the reported chemical_name
strings in streams_chem.df
to match the parameter
strings in the WQS table (nysdec_wqs
).
clean_up.df <- streams_chem.df %>% mutate(chemical_name = case_when( chemical_name %in% "chloride (as cl)" ~ "chloride", chemical_name %in% "coliform" ~ "total_coliform", chemical_name %in% "fecal coliform" ~ "fecal_coliforms", chemical_name %in% "fluoride, undistilled" ~ "fluoride", chemical_name %in% "hardness (as caco3)" ~ "hardness", chemical_name %in% "nitrate+nitrite as nitrogen" ~ "nitrate_nitrite", chemical_name %in% "nitrogen, ammonia (as n)" ~ "ammonia", chemical_name %in% "nitrogen, nitrate (as n)" ~ "nitrate", chemical_name %in% "nitrogen, nitrite" ~ "nitrite", chemical_name %in% "phosphorus, total (as p)" ~ "phosphorus", chemical_name %in% "sulfate (as so4)" ~ "sulfate", chemical_name %in% "total dissolved solids (residue, filterable)" ~ "total_dissolved_solids", TRUE ~ chemical_name ))
Clean up sample dates and create a date
column without the sample time. Sample time is not necessary for assessment purposes.
clean_up.df <- clean_up.df %>% mutate(sample_date = if_else(nchar(sample_date) > 10, lubridate::mdy_hms(sample_date, quiet = TRUE, tz = "America/New_York"), as.POSIXct(lubridate::mdy(sample_date, quiet = TRUE, tz = "America/New_York"))), date = as.Date(sample_date))
Append the in situ data to the chemical data.
clean_up.df <- bind_rows(clean_up.df, prep_insitu.df)
Confirm that there are no duplicate chemicals reported for the same sample.
dups.df <- clean_up.df %>% group_by_at(vars(-result_value, -result_unit)) %>% mutate(n = n()) %>% ungroup() %>% filter(n > 1) %>% select(n, everything()) %>% arrange(chemical_name, sys_sample_code) if (nrow(dups.df) > 0 ) { vroom::vroom_write(dups.df, file.path(here::here(), "data-raw", "dups.csv"), delim = ",") stop("Check dups.df for duplicate values.") }
Rename multiple columns to the standard naming convention required by the stayCALM package.
smas.df <- clean_up.df %>% rename(seg_id = site_pwl_id, units = result_unit, parameter = chemical_name, data_provider = project_name)
Create a sample_id
column representing the concetenation of site_id
and date
to enable aggregation by individual samples.
smas.df$sample_id <- paste(smas.df$site_id, smas.df$date, sep = "_")
Reformat the reported chemical fraction
, which should be "total" or "dissolved."
smas.df <- smas.df %>% mutate(fraction = case_when( fraction %in% "T" ~ "total", fraction %in% "D" ~ "dissolved", is.na(fraction) & parameter %in% c("ph", "total_coliforms", "fecal_coliforms") ~ "total", parameter %in% c("dissolved_oxygen", "total_dissolved_solids") ~ "dissolved", TRUE ~ "ERROR" ))
Create a value
column that represents reported result_value
when available and the quantitation_limit
when the parameter is reported as undetected. Using the quantitation_limit
provides a numeric value that can be compared against a WQS. Although the quantitation_limit
may not represent an accurate value for a given sample, it is accurate enough to compare against a WQS; there should be no WQSs below quantitation_limit
, and therefore all undetected parameters should be attaining WQSs.
smas.df$value <- ifelse(smas.df$validator_qualifiers %in% "U", as.numeric(smas.df$quantitation_limit), smas.df$result_value)
Convert all character values in smas.df
to lowercase. This makes it easier to work with the data and prevents miss-matching due to case-sensitivity.
smas.df <- mutate_if(smas.df, is.character, tolower)
Ensure that the parameter units in smas.df
match the units of the WQS in nysdec_wqs
.
keep.vec <- c("parameter", "fraction", "units") merged.df <- merge(unique(smas.df[keep.vec]), unique(nysdec_wqs[keep.vec]), by = c("parameter", "fraction")) merged.df$units_comp <- paste(merged.df$units.x, merged.df$units.y, sep = ":") names(merged.df)[names(merged.df) %in% "units.x"] <- "units" units_comp.df <- merge(smas.df, merged.df, by = keep.vec, all.x = TRUE) units_comp.df$units_comp <- ifelse(is.na(units_comp.df$units_comp), "no_match", units_comp.df$units_comp) split.list <- by(units_comp.df, units_comp.df$units_comp, function(i) { units_comp.scalar <- unique(i$units_comp) if (units_comp.scalar %in% "mg/l:ug/l") { i$value <- i$value * 1000 i$units <- "ug/l" } if (units_comp.scalar %in% "ug/l:mg/l") { i$value <- i$value / 1000 i$units <- "mg/l" } if (units_comp.scalar %in% "ng/l:ug/l") { i$value <- i$value / 1000 i$units <- "ug/l" } if (units_comp.scalar %in% "ppt:ug/l") { i$units <- "ug/l" } if (units_comp.scalar %in% "ph units:ph_units") { i$units <- "ph_units" } if (units_comp.scalar %in% "NA:ug/l") { i$units <- "ug/l" } if (units_comp.scalar %in% "NA:cfu/100ml") { i$units <- "cfu/100ml" } if (!units_comp.scalar %in% c("mg/l:ug/l", "ug/l:mg/l", "ng/l:ug/l", "ppt:ug/l", "ph units:ph_units")) { break.vec <- unlist(strsplit(units_comp.scalar, ":")) if (length(unique(break.vec)) != 1) { warning(paste0("Review required...", "\n", "\t Supplied: ", break.vec[1], "\n", "\t Required: ", break.vec[2], "\n")) } } return(i) }) prepped.df <- do.call(rbind, split.list)
Identify all of the parameters in nysdec_wqs
that were not matched in smas.df
. There will generally be a list of several parameter names that SMAS does not collect. This table should be manually compared against the observed values to ensure that the sample was truely not collected and not just mislabeled.
no_wqs_match.df <- prepped.df %>% select(parameter, everything()) %>% anti_join(stayCALM::nysdec_wqs, ., by = c("parameter", "fraction", "units")) %>% select(parameter, fraction, units) %>% distinct() %>% arrange() params.df <- prepped.df %>% select(parameter, fraction, units) %>% distinct() %>% arrange()
Add placeholder columns. depth
is needed to be consistent with LMAS data; depth
is not recorded by SMAS, and therefore this column is filled with NA
s.
prepped.df$depth <- NA_real_
Subset the prepped.df
to retain only the columns of interest.
keep.vec <- c("seg_id", "site_id", "sample_id", "depth", # Adding depth to match LMAS columns "date", "fraction", "parameter", "value", "units", "quantitation_limit", "validator_qualifiers", "interpreted_qualifiers", "data_provider" ) # keep.vec[!keep.vec %in% names(prepped.df)] final_smas.df <- subset(prepped.df, select = keep.vec)
Remove rows that have been flagged as "r" in the validator_qualifiers
column indicating that the values did not meet QA standards and should be removed ("r").
final_smas.df <- final_smas.df[!final_smas.df$validator_qualifiers %in% "r", ]
count.df <- final_smas.df %>% group_by_all() %>% mutate(n = n()) final_smas.df2 <- unique(final_smas.df)
With the usethis package, the SMAS chemistry data is exported as a .rda file making it easily accessible during the development and testing of the stayCALM package.
smas.df <- final_smas.df usethis::use_data(smas.df, overwrite = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.