knitr::opts_chunk$set(echo = TRUE)
The purpose of this RMarkdown document is to provide a streamlined, reproducible process for generating time series and aggregated bar chart images for use in documents, presentations, and TMDLs. Currently, it is catered toward E. coli data, but it could be adapted to accommodate other data sources as well.
The user will need to fill in the information below to run the rest of the script and generate the figures.
file_path = "em_creek.csv" au_name = "Emigration" aggFun ="gmean" # geometric mean correction_factor = 24465715 # correction factor for E. coli margin_of_safety = 0.1 # percentage rec_season=c(121,304) # days of the year irrigation_season=c(135,288) # days of the year site_order = data.frame(MonitoringLocationIdentifier = c( "BR_12.29", "BR_14.15", "BR_14.44", "EM_01.62", "EM_02.54", "EM_03.67", "EM_04.17", "EM_05.17", "EM_07.30", "EM_07.79", "EM_08.50", "EM_08.83", "EM_08.93", "EM_09.48", "EM_10.43", "EM_11.87", "KL_00.21", "KL_01.50" ),Order = c(1:18)) colorz = c("#f44336", "#e81e63", "#9c27b0", "#673ab7", "#3f51b5", "#2196f3", "#03a9f4", "#00bcd4", "#009688", "#4caf50", "#8bc34a", "#cddc39", "#ffeb3b", "#ffc107", "#ff9800", "#ff5722", "#795548", "#9e9e9e", "#607d8b", "#000000") # colorz = c("#034963","#0b86a3","#00a1c6","#cde3e2", "#BFD7B5","#D1CAA1","#D1D2F9","#77625C","#EFA9AE") # these are HEX color codes. Feel free to lengthen, shorten, or change the colors based on your dataset
No need to change unless fundamental figures change.
list.of.packages <- c("plyr","dplyr","openxlsx","ggplot2","tidyr","lubridate","plotly","devtools") new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] if(length(new.packages)) install.packages(new.packages) if(!("dwqInsights"%in%installed.packages()[,"Package"])) devtools::install_github("edhinman/dwqInsights") library(dplyr) library(openxlsx) library(lubridate) library(dwqInsights) colorsy=colorz[1:nrow(site_order)]
The function below aggregates data to a daily mean, determines which datapoints exceed the numeric criterion, groups data by recreation and irrigation seasons, and calculates observed and TMDL loading if flow is provided in the data. It produces an R data output that is ideal for plotting and figure production, as well as an output .csv for documentation and tracking purposes.
source("R/tmdlCalcs.R") output = tmdlCalcs(wb_path = file_path, aggFun = aggFun, cf = correction_factor, mos = margin_of_safety, rec_ssn = rec_season, irg_ssn = irrigation_season, exportfromfunc = TRUE) head(output)
The code below aggregates the output of tmdlCalcs further into site-specific, seasonal statistics. If the aggregating function provided above is the geometric mean (labeled "gmean"), all means will be calculated using the geometric mean.
gmean = function(x){exp(mean(log(x)))} dat = output dat_agg = output%>%group_by(MonitoringLocationIdentifier, CharacteristicName,ResultMeasure.MeasureUnitCode)%>%summarise(daterange = paste0(min(Date)," to ",max(Date)),samplesize=length(DailyResultMeasureValue),minconc = min(DailyResultMeasureValue),arithmean=mean(DailyResultMeasureValue),maxconc = max(DailyResultMeasureValue), perc_exc = sum(Exceeds)/length(Exceeds)*100) rec_mean = output%>%filter(Rec_Season=="rec")%>%group_by(MonitoringLocationIdentifier, CharacteristicName,ResultMeasure.MeasureUnitCode)%>%summarise(rec_ssn_arithmean=mean(DailyResultMeasureValue),rec_ssn_perc_exc = sum(Exceeds)/length(Exceeds)*100) dat_agg = merge(dat_agg, rec_mean, all = TRUE) if(aggFun=="gmean"){ dat_agg_geo = output%>%group_by(MonitoringLocationIdentifier, CharacteristicName, ResultMeasure.MeasureUnitCode)%>%summarise(geomean = gmean(DailyResultMeasureValue)) rec_geo = output%>%filter(Rec_Season=="rec")%>%group_by(MonitoringLocationIdentifier, CharacteristicName, ResultMeasure.MeasureUnitCode)%>%summarise(rec_ssn_geomean=gmean(DailyResultMeasureValue)) dat_agg_geo = merge(dat_agg_geo, rec_geo, all = TRUE) } dat_agg = merge(dat_agg, dat_agg_geo, all = TRUE) if("DailyFlowValue"%in%colnames(output)){ flo_agg = output%>%filter(!is.na(DailyFlowValue))%>%group_by(MonitoringLocationIdentifier, ResultMeasure.MeasureUnitCode)%>%summarise(daterange = paste0(min(Date)," to ",max(Date)),samplesize=length(DailyFlowValue),minconc = min(DailyFlowValue),arithmean=mean(DailyFlowValue),maxconc = max(DailyFlowValue)) dat_agg = plyr::rbind.fill(dat_agg, flo_agg) dat_agg$CharacteristicName[is.na(dat_agg$CharacteristicName)] = "Flow" } write.csv(dat_agg, paste0(au_name,"_summary_table_stats.csv"), row.names = FALSE) head(dat_agg)
This code block uses the site order specified at the beginning of this document to plot all samples collected at each site and show the overall sample mean or geometric mean. The points are randomly jittered for easier viewing of the data distribution/frequency of sample values. As such, re-rendering the plot may produce slightly different looking point locations, but the values should remain the same.
## GGPLOT sites = site_order[order(site_order$Order),] sites = as.character(sites$MonitoringLocationIdentifier) dat = output dat$MonitoringLocationIdentifier = factor(as.character(dat$MonitoringLocationIdentifier), levels = sites) f = ggplot(dat, aes(x=MonitoringLocationIdentifier,y=DailyResultMeasureValue))+geom_jitter(aes(fill=factor(MonitoringLocationIdentifier)), position=position_jitter(0.1), shape=21, size=2, color="#646464")+scale_fill_manual(values= colorsy)+theme_classic()+labs(x="Monitoring Location ID", y=unique(dat$ResultMeasure.MeasureUnitCode))+stat_summary(fun=gmean, geom="point", fill="#FFB800",color="black", size=2.5, shape=23)+geom_hline(yintercept=unique(dat$NumericCriterion), color="#cb181d",size=0.5)+theme(legend.position = "none", axis.text.x=element_text(angle=45, hjust=1)) # NOTE, should adjust numeric criterion line to accommodate variable criteria (e.g. hardness-dependent) f ggsave(paste0(au_name,"_downstream_upstream.jpg"),width=6,height=4, units="in", dpi=500)
This code block displays sample concentration timeseries by site, ordered downstream to upstream. It includes a horizontal line for the numeric criterion, as well as a grayed out background denoting the typical recreational season bounds.
dat = output dat = dat[order(dat$Date),] dat$MonitoringLocationIdentifier = factor(as.character(dat$MonitoringLocationIdentifier), levels = sites) t = ggplot(dat, aes(Date,DailyResultMeasureValue, fill=as.factor(MonitoringLocationIdentifier)))+scale_x_date(limits=c(min(dat$Date), max(dat$Date)))+geom_point(shape=21,color="#646464", size=2)+facet_wrap(vars(MonitoringLocationIdentifier), scales="free", ncol=1)+theme_classic()+labs(x="Date", y=unique(dat$ResultMeasure.MeasureUnitCode))+geom_hline(yintercept=unique(dat$NumericCriterion), color="#cb181d",size=0.5)+theme(legend.position = "none")+scale_fill_manual(values=colorsy) t ggsave(paste0(au_name,"_timeseries.jpg"),width=6,height=18, units="in", dpi=500)
This code block displays all samples at each site, grouped monthly with an overall monthly mean (geometric mean). The numeric criterion is also plotted on each panel.
dat = output dat$month = lubridate::month(dat$Date, label=TRUE) dat = dat[order(dat$month),] dat$MonitoringLocationIdentifier = factor(as.character(dat$MonitoringLocationIdentifier), levels = sites) g = ggplot(dat, aes(month, DailyResultMeasureValue, fill=factor(MonitoringLocationIdentifier)))+geom_blank()+geom_rect(aes(xmin=4.5,ymin=0,xmax=10.5,ymax=max(DailyResultMeasureValue)), fill="#D3D3D3")+scale_x_discrete(limits=month.abb)+geom_jitter(position=position_jitter(0), size=3, shape=21, color="#646464", alpha=0.65)+facet_wrap(vars(MonitoringLocationIdentifier), scales="free", ncol=1)+theme_classic()+labs(x="Month", y=unique(dat$ResultMeasure.MeasureUnitCode))+geom_hline(yintercept=unique(dat$NumericCriterion), color="#cb181d",size=0.5)+theme(legend.position = "none")+scale_fill_manual(values=colorsy)+stat_summary(fun=gmean, geom="point", fill="#FFB800",color="black", size=3, shape=23) g ggsave(paste0(au_name,"_monthly.jpg"),width=6,height=18, units="in", dpi=500)
This code block uses samples for which there is a paired flow value. For each monitoring location, a bar chart is produced with side by side mean Observed Loading and TMDL values for each month for which data are present.
if("TMDL"%in%colnames(output)){ ldc=output ldc_month = subset(ldc, !is.na(ldc$Observed_Loading)) ldc_month = ldc_month%>%select(Date,MonitoringLocationIdentifier,TMDL,Observed_Loading)%>%pivot_longer(cols=c(TMDL,Observed_Loading),names_to="Type",values_to="Loading",values_drop_na=TRUE) ldc_month$Loading_Giga = ldc_month$Loading/1000000000 ldc_month$month = lubridate::month(ldc_month$Date, label=TRUE, abbr=TRUE) ldc_month1 = ldc_month%>%group_by(MonitoringLocationIdentifier,Type,month)%>%summarise(mean_Load = mean(Loading_Giga)) ldcsites = unique(ldc_month1$MonitoringLocationIdentifier) for(i in 1:length(ldcsites)){ name = ldcsites[i] g = ggplot(ldc_month1, aes(x=month, y=mean_Load, fill=Type))+geom_blank()+theme_classic()+geom_rect(aes(xmin=4.5,ymin=0,xmax=10.5,ymax=max(ldc_month1$mean_Load*1.1)), fill="#D3D3D3")+scale_x_discrete(limits=month.abb)+labs(x="Month",y="GigaMPN/day")+geom_col(position="dodge", color="#646464")+scale_fill_manual(values=c("#00a1c6","#034963"),name="",breaks=c("Observed_Loading","TMDL"),labels=c("Observed","TMDL"))+guides(color="none") g ggsave(paste0(name,"_monthly_load.jpg"),width=8,height=4, units="in", dpi=500) } g }
if("TMDL"%in%colnames(output)){ ldc = output ldc$TMDL_giga = ldc$TMDL/1000000000 ldc$Observed_Loading_giga = ldc$Observed_Loading/1000000000 ldc = ldc[order(ldc$Flow_Percentile),] exceed = subset(ldc, !is.na(ldc$Observed_Loading_giga)) exceed = within(exceed,{ regime=NA regime[Flow_Percentile<10] = "High \nFlows" regime[Flow_Percentile>9&Flow_Percentile<40] = "Moist \nConditions" regime[Flow_Percentile>39&Flow_Percentile<60] = "Mid-Range \nFlows" regime[Flow_Percentile>59&Flow_Percentile<90] = "Dry \nConditions" regime[Flow_Percentile>89&Flow_Percentile<101] = "Low \nFlows" }) exceed = exceed%>%group_by(MonitoringLocationIdentifier, regime)%>%summarise(perc_exceed=round(length(Exceeds[Exceeds==1])/length(Exceeds)*100,digits=0)) exceed = within(exceed,{ place = NA place[regime=="High \nFlows"] =5 place[regime=="Moist \nConditions"] = 25 place[regime=="Mid-Range \nFlows"]=50 place[regime=="Dry \nConditions"] = 75 place[regime=="Low \nFlows"] = 95 }) exceed$label = paste0(exceed$regime,"\n(",exceed$perc_exceed,"%)") ldcsites = unique(exceed$MonitoringLocationIdentifier) for(i in 1:length(ldcsites)){ name = ldcsites[i] why = max(c(ldc$TMDL_giga,ldc$Observed_Loading_giga), na.rm = TRUE)*0.8 l = ggplot(ldc, aes(x=Flow_Percentile))+geom_blank()+geom_vline(xintercept=c(10,40,60,90),linetype=2)+geom_line(aes(y=TMDL_giga, color="TMDL_giga"),color="#034963",size=1.5)+geom_point(aes(y=Observed_Loading_giga, color="Observed_Loading_giga"),shape=21, color="#464646",fill="#00a1c6",size=3)+theme_classic()+labs(x="Flow Percentile",y="GigaMPN/day")+annotate("text",x=exceed$place,y=why, label=exceed$label)+scale_color_manual(name = "", values = c( "TMDL_giga" = "#034963", "Observed_Loading_giga" = "#00a1c6"), labels = c("TMDL", "Observed Loading")) l ggsave(paste0(name,"_ldc.jpg"),width=8,height=4, units="in", dpi=500) } l }
STILL WORKING ON THIS PART. SHOULD IT BE GAGE DATA OR DATA FROM THE SHEET?
flow = subset(output, !is.na(output$DailyFlowValue)) h=ggplot(flow, aes(x=Date,y=DailyFlowValue))+geom_area(color="#646464",fill="#0b86a3")+labs(x="Date",y="Flow (cfs)")+theme_classic() h ggsave(paste0(au_name,"_flow_ts.jpg"),width=8,height=4, units="in", dpi=500) flow$month = lubridate::month(flow$Date) flo_month = flow%>%group_by(month)%>%summarise(mean_monthly = mean(DailyFlowValue, na.rm=TRUE)) m = ggplot(flo_month, aes(x=month, y=mean_monthly))+geom_col(color="#646464",fill="#0b86a3")+labs(x="Month",y="Flow (cfs)")+theme_classic()+scale_x_discrete(limits=month.abb) m ggsave(paste0(au_name,"_flow_monthly.jpg"),width=8,height=4, units="in", dpi=500)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.