In edhinman/dwqInsights: Tools for water quality sample planning and analysis

knitr::opts_chunk$set(echo = TRUE)

Purpose

The purpose of this RMarkdown document is to provide a streamlined, reproducible process for generating time series and aggregated bar chart images for use in documents, presentations, and TMDLs. Currently, it is catered toward E. coli data, but it could be adapted to accommodate other data sources as well.

Input information

The user will need to fill in the information below to run the rest of the script and generate the figures.

file_path = "em_creek.csv"
au_name = "Emigration"
aggFun ="gmean" # geometric mean
correction_factor = 24465715 # correction factor for E. coli
margin_of_safety = 0.1 # percentage
rec_season=c(121,304) # days of the year
irrigation_season=c(135,288) # days of the year
site_order = data.frame(MonitoringLocationIdentifier = c(
  "BR_12.29",
  "BR_14.15",
  "BR_14.44",
  "EM_01.62",
  "EM_02.54",
  "EM_03.67",
  "EM_04.17",
  "EM_05.17",
  "EM_07.30",
  "EM_07.79",
  "EM_08.50",
  "EM_08.83",
  "EM_08.93",
  "EM_09.48",
  "EM_10.43",
  "EM_11.87",
  "KL_00.21",
  "KL_01.50"
),Order = c(1:18))
colorz = c("#f44336",
"#e81e63",
"#9c27b0",
"#673ab7",
"#3f51b5",
"#2196f3",
"#03a9f4",
"#00bcd4",
"#009688",
"#4caf50",
"#8bc34a",
"#cddc39",
"#ffeb3b",
"#ffc107",
"#ff9800",
"#ff5722",
"#795548",
"#9e9e9e",
"#607d8b",
"#000000")
# colorz = c("#034963","#0b86a3","#00a1c6","#cde3e2", "#BFD7B5","#D1CAA1","#D1D2F9","#77625C","#EFA9AE") # these are HEX color codes. Feel free to lengthen, shorten, or change the colors based on your dataset

Required packages

No need to change unless fundamental figures change.

list.of.packages <- c("plyr","dplyr","openxlsx","ggplot2","tidyr","lubridate","plotly","devtools")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
if(!("dwqInsights"%in%installed.packages()[,"Package"])) devtools::install_github("edhinman/dwqInsights")
library(dplyr)
library(openxlsx)
library(lubridate)
library(dwqInsights)
colorsy=colorz[1:nrow(site_order)]

Run tmdlCalc function

The function below aggregates data to a daily mean, determines which datapoints exceed the numeric criterion, groups data by recreation and irrigation seasons, and calculates observed and TMDL loading if flow is provided in the data. It produces an R data output that is ideal for plotting and figure production, as well as an output .csv for documentation and tracking purposes.

source("R/tmdlCalcs.R")
output = tmdlCalcs(wb_path = file_path, aggFun = aggFun, cf = correction_factor, mos = margin_of_safety, rec_ssn = rec_season, irg_ssn = irrigation_season, exportfromfunc = TRUE)
head(output)

Create Aggregation Table Stats

The code below aggregates the output of tmdlCalcs further into site-specific, seasonal statistics. If the aggregating function provided above is the geometric mean (labeled "gmean"), all means will be calculated using the geometric mean.

gmean = function(x){exp(mean(log(x)))}
dat = output
dat_agg = output%>%group_by(MonitoringLocationIdentifier, CharacteristicName,ResultMeasure.MeasureUnitCode)%>%summarise(daterange = paste0(min(Date)," to ",max(Date)),samplesize=length(DailyResultMeasureValue),minconc = min(DailyResultMeasureValue),arithmean=mean(DailyResultMeasureValue),maxconc = max(DailyResultMeasureValue), perc_exc = sum(Exceeds)/length(Exceeds)*100)
rec_mean = output%>%filter(Rec_Season=="rec")%>%group_by(MonitoringLocationIdentifier, CharacteristicName,ResultMeasure.MeasureUnitCode)%>%summarise(rec_ssn_arithmean=mean(DailyResultMeasureValue),rec_ssn_perc_exc = sum(Exceeds)/length(Exceeds)*100)
dat_agg = merge(dat_agg, rec_mean, all = TRUE)
if(aggFun=="gmean"){
  dat_agg_geo = output%>%group_by(MonitoringLocationIdentifier, CharacteristicName, ResultMeasure.MeasureUnitCode)%>%summarise(geomean = gmean(DailyResultMeasureValue))
  rec_geo = output%>%filter(Rec_Season=="rec")%>%group_by(MonitoringLocationIdentifier, CharacteristicName, ResultMeasure.MeasureUnitCode)%>%summarise(rec_ssn_geomean=gmean(DailyResultMeasureValue))
  dat_agg_geo = merge(dat_agg_geo, rec_geo, all = TRUE)
}
dat_agg = merge(dat_agg, dat_agg_geo, all = TRUE)
if("DailyFlowValue"%in%colnames(output)){
  flo_agg = output%>%filter(!is.na(DailyFlowValue))%>%group_by(MonitoringLocationIdentifier, ResultMeasure.MeasureUnitCode)%>%summarise(daterange = paste0(min(Date)," to ",max(Date)),samplesize=length(DailyFlowValue),minconc = min(DailyFlowValue),arithmean=mean(DailyFlowValue),maxconc = max(DailyFlowValue))
  dat_agg = plyr::rbind.fill(dat_agg, flo_agg)
  dat_agg$CharacteristicName[is.na(dat_agg$CharacteristicName)] = "Flow"
}
write.csv(dat_agg, paste0(au_name,"_summary_table_stats.csv"), row.names = FALSE)
head(dat_agg)

Upstream/Downstream

This code block uses the site order specified at the beginning of this document to plot all samples collected at each site and show the overall sample mean or geometric mean. The points are randomly jittered for easier viewing of the data distribution/frequency of sample values. As such, re-rendering the plot may produce slightly different looking point locations, but the values should remain the same.

## GGPLOT
sites = site_order[order(site_order$Order),]
sites = as.character(sites$MonitoringLocationIdentifier)
dat = output
dat$MonitoringLocationIdentifier = factor(as.character(dat$MonitoringLocationIdentifier), levels = sites)

f = ggplot(dat, aes(x=MonitoringLocationIdentifier,y=DailyResultMeasureValue))+geom_jitter(aes(fill=factor(MonitoringLocationIdentifier)), position=position_jitter(0.1), shape=21, size=2, color="#646464")+scale_fill_manual(values= colorsy)+theme_classic()+labs(x="Monitoring Location ID", y=unique(dat$ResultMeasure.MeasureUnitCode))+stat_summary(fun=gmean, geom="point", fill="#FFB800",color="black", size=2.5, shape=23)+geom_hline(yintercept=unique(dat$NumericCriterion), color="#cb181d",size=0.5)+theme(legend.position = "none", axis.text.x=element_text(angle=45, hjust=1)) # NOTE, should adjust numeric criterion line to accommodate variable criteria (e.g. hardness-dependent)
f
ggsave(paste0(au_name,"_downstream_upstream.jpg"),width=6,height=4, units="in", dpi=500)

Timeseries

This code block displays sample concentration timeseries by site, ordered downstream to upstream. It includes a horizontal line for the numeric criterion, as well as a grayed out background denoting the typical recreational season bounds.

dat = output
dat = dat[order(dat$Date),]
dat$MonitoringLocationIdentifier = factor(as.character(dat$MonitoringLocationIdentifier), levels = sites)

t = ggplot(dat, aes(Date,DailyResultMeasureValue, fill=as.factor(MonitoringLocationIdentifier)))+scale_x_date(limits=c(min(dat$Date), max(dat$Date)))+geom_point(shape=21,color="#646464", size=2)+facet_wrap(vars(MonitoringLocationIdentifier), scales="free", ncol=1)+theme_classic()+labs(x="Date", y=unique(dat$ResultMeasure.MeasureUnitCode))+geom_hline(yintercept=unique(dat$NumericCriterion), color="#cb181d",size=0.5)+theme(legend.position = "none")+scale_fill_manual(values=colorsy)
t
ggsave(paste0(au_name,"_timeseries.jpg"),width=6,height=18, units="in", dpi=500)

Monthly concentration plots

This code block displays all samples at each site, grouped monthly with an overall monthly mean (geometric mean). The numeric criterion is also plotted on each panel.

dat = output
dat$month = lubridate::month(dat$Date, label=TRUE)
dat = dat[order(dat$month),]
dat$MonitoringLocationIdentifier = factor(as.character(dat$MonitoringLocationIdentifier), levels = sites)

g = ggplot(dat, aes(month, DailyResultMeasureValue, fill=factor(MonitoringLocationIdentifier)))+geom_blank()+geom_rect(aes(xmin=4.5,ymin=0,xmax=10.5,ymax=max(DailyResultMeasureValue)), fill="#D3D3D3")+scale_x_discrete(limits=month.abb)+geom_jitter(position=position_jitter(0), size=3, shape=21, color="#646464", alpha=0.65)+facet_wrap(vars(MonitoringLocationIdentifier), scales="free", ncol=1)+theme_classic()+labs(x="Month", y=unique(dat$ResultMeasure.MeasureUnitCode))+geom_hline(yintercept=unique(dat$NumericCriterion), color="#cb181d",size=0.5)+theme(legend.position = "none")+scale_fill_manual(values=colorsy)+stat_summary(fun=gmean, geom="point", fill="#FFB800",color="black", size=3, shape=23)
g
ggsave(paste0(au_name,"_monthly.jpg"),width=6,height=18, units="in", dpi=500)

Monthly Observed vs TMDL Loading

This code block uses samples for which there is a paired flow value. For each monitoring location, a bar chart is produced with side by side mean Observed Loading and TMDL values for each month for which data are present.

if("TMDL"%in%colnames(output)){
  ldc=output
ldc_month = subset(ldc, !is.na(ldc$Observed_Loading))
ldc_month = ldc_month%>%select(Date,MonitoringLocationIdentifier,TMDL,Observed_Loading)%>%pivot_longer(cols=c(TMDL,Observed_Loading),names_to="Type",values_to="Loading",values_drop_na=TRUE)
ldc_month$Loading_Giga = ldc_month$Loading/1000000000
ldc_month$month = lubridate::month(ldc_month$Date, label=TRUE, abbr=TRUE)

ldc_month1 = ldc_month%>%group_by(MonitoringLocationIdentifier,Type,month)%>%summarise(mean_Load = mean(Loading_Giga))

ldcsites = unique(ldc_month1$MonitoringLocationIdentifier)
for(i in 1:length(ldcsites)){
  name = ldcsites[i]
  g = ggplot(ldc_month1, aes(x=month, y=mean_Load, fill=Type))+geom_blank()+theme_classic()+geom_rect(aes(xmin=4.5,ymin=0,xmax=10.5,ymax=max(ldc_month1$mean_Load*1.1)), fill="#D3D3D3")+scale_x_discrete(limits=month.abb)+labs(x="Month",y="GigaMPN/day")+geom_col(position="dodge", color="#646464")+scale_fill_manual(values=c("#00a1c6","#034963"),name="",breaks=c("Observed_Loading","TMDL"),labels=c("Observed","TMDL"))+guides(color="none")
  g
  ggsave(paste0(name,"_monthly_load.jpg"),width=8,height=4, units="in", dpi=500)
}
g
}

Load Duration Curve

if("TMDL"%in%colnames(output)){
  ldc = output
ldc$TMDL_giga = ldc$TMDL/1000000000
ldc$Observed_Loading_giga = ldc$Observed_Loading/1000000000
ldc = ldc[order(ldc$Flow_Percentile),]

exceed = subset(ldc, !is.na(ldc$Observed_Loading_giga))
exceed = within(exceed,{
  regime=NA
  regime[Flow_Percentile<10] = "High \nFlows"
  regime[Flow_Percentile>9&Flow_Percentile<40] = "Moist \nConditions"
  regime[Flow_Percentile>39&Flow_Percentile<60] = "Mid-Range \nFlows"
  regime[Flow_Percentile>59&Flow_Percentile<90] = "Dry \nConditions"
  regime[Flow_Percentile>89&Flow_Percentile<101] = "Low \nFlows"
})

exceed = exceed%>%group_by(MonitoringLocationIdentifier, regime)%>%summarise(perc_exceed=round(length(Exceeds[Exceeds==1])/length(Exceeds)*100,digits=0))

exceed = within(exceed,{
  place = NA
  place[regime=="High \nFlows"] =5
  place[regime=="Moist \nConditions"] = 25
  place[regime=="Mid-Range \nFlows"]=50
  place[regime=="Dry \nConditions"] = 75
  place[regime=="Low \nFlows"] = 95
})

exceed$label = paste0(exceed$regime,"\n(",exceed$perc_exceed,"%)")
ldcsites = unique(exceed$MonitoringLocationIdentifier)
for(i in 1:length(ldcsites)){
  name = ldcsites[i]
  why = max(c(ldc$TMDL_giga,ldc$Observed_Loading_giga), na.rm = TRUE)*0.8
  l = ggplot(ldc, aes(x=Flow_Percentile))+geom_blank()+geom_vline(xintercept=c(10,40,60,90),linetype=2)+geom_line(aes(y=TMDL_giga, color="TMDL_giga"),color="#034963",size=1.5)+geom_point(aes(y=Observed_Loading_giga, color="Observed_Loading_giga"),shape=21, color="#464646",fill="#00a1c6",size=3)+theme_classic()+labs(x="Flow Percentile",y="GigaMPN/day")+annotate("text",x=exceed$place,y=why, label=exceed$label)+scale_color_manual(name = "",
values = c( "TMDL_giga" = "#034963", "Observed_Loading_giga" = "#00a1c6"),
labels = c("TMDL", "Observed Loading"))
l
ggsave(paste0(name,"_ldc.jpg"),width=8,height=4, units="in", dpi=500)
}
l
}

Flow timeseries

STILL WORKING ON THIS PART. SHOULD IT BE GAGE DATA OR DATA FROM THE SHEET?

flow = subset(output, !is.na(output$DailyFlowValue))
h=ggplot(flow, aes(x=Date,y=DailyFlowValue))+geom_area(color="#646464",fill="#0b86a3")+labs(x="Date",y="Flow (cfs)")+theme_classic()
h
ggsave(paste0(au_name,"_flow_ts.jpg"),width=8,height=4, units="in", dpi=500)

flow$month = lubridate::month(flow$Date)
flo_month = flow%>%group_by(month)%>%summarise(mean_monthly = mean(DailyFlowValue, na.rm=TRUE))

m = ggplot(flo_month, aes(x=month, y=mean_monthly))+geom_col(color="#646464",fill="#0b86a3")+labs(x="Month",y="Flow (cfs)")+theme_classic()+scale_x_discrete(limits=month.abb)
m
ggsave(paste0(au_name,"_flow_monthly.jpg"),width=8,height=4, units="in", dpi=500)