library(CSLSscenarios)
library(dplyr)
library(reshape2)
library(ggplot2)
library(extrafont)
library(lubridate)
library(DT)


rules            <- CSLSscenarios::ecological_rules %>%
                    select(.data$hydrology, .data$metric, .data$variable,
                           .data$Pleasant, .data$Long, .data$Plainfield) %>%
                    unique() %>%
                    melt(id.vars = c("hydrology", "metric", "variable")) 
colnames(rules)  <- c("hydrology", "metric", "variable", "lake", "value")
rules            <- rules %>%
                    filter(.data$value == TRUE) %>%
                    select(.data$lake, .data$hydrology, .data$metric, .data$variable)
useful_metrics   <- unique(rules[,c("lake", "hydrology", "metric", "variable")])
MODFLOW_metrics  <- CSLSscenarios::MODFLOW_metrics %>%
                    filter(.data$series == "month",
                           .data$scenario %in% c("no_irr", "cur_irr")) %>%
                    inner_join(useful_metrics,
                               by = c("lake", "metric", "variable"))

summary_metrics  <- MODFLOW_metrics %>%
                    group_by(.data$lake, .data$scenario,
                             .data$metric, .data$variable) %>%
                    summarise(base = .data$value[which(.data$sim == 1)],
                              q25 = quantile(.data$value,
                                             probs = 0.75,
                                             type = 6,
                                             na.rm = TRUE),
                              q75 = quantile(.data$value,
                                             probs = 0.25,
                                             type = 6,
                                             na.rm = TRUE),
                              IQR = IQR(.data$value, 
                                        type = 6,
                                        na.rm = TRUE),
                              upper = .data$q25 + 1.5*.data$IQR,
                              lower = .data$q75 - 1.5*.data$IQR,
                              sd = sd(.data$value, na.rm = TRUE),
                              .groups = "drop") %>%
                    mutate_at(c("base", "q25", "q75", "IQR", "upper", 
                                "lower", "sd"), as.numeric)

outliers        <- MODFLOW_metrics %>%
                   left_join(summary_metrics, 
                             by = c("lake", "scenario", "metric", "variable")) %>%
                   mutate(outlier = ifelse((.data$value > .data$upper | 
                                              .data$value < .data$lower),
                                           TRUE, FALSE))
outliers$sim     <- factor(outliers$sim)
outliers$outlier <- factor(outliers$outlier)

Overview

This is an exploration of the Monte Carlo SWB simualations, focused on the no-irrigated-agriculture and the difference between no-irrigated-agriculture and current-irrigated-agriculture. We tried to use this to understand if any simulations were outliers, and if so, what caused the differences. Ultimately, we did not "throw out" any simulations, though we focused primarily on the "base" parameterization of SWB, as well as the "large drawdown", and "small drawdown" parameterizations.

I limited this evaluation to only those hydrologic metrics which we use for at least one of our ecological rules. This is <50% of all metrics we calculate, but are the metrics most meaningful to us.


Plots

out1 <- c(304, 233, 25)
out2 <- c(116, 224)


MODFLOW <- CSLSdata::MODFLOW %>%
           filter(year(.data$date) >= 1986,
                  year(.data$date) <= 2018) %>%
           select(.data$scenario, .data$sim, .data$lake, 
                  .data$level_m, .data$date)
MOD1    <- MODFLOW %>% filter(.data$scenario == "no_irr")
MOD2    <- MODFLOW %>% filter(.data$scenario == "cur_irr")
DIFF    <- full_join(MOD1, MOD2,
                     by = c("lake", "sim", "date")) %>%
           mutate(diff = .data$level_m.y - .data$level_m.x)


ggplot() +
  geom_line(data = filter(MOD1, .data$sim %in% c(out1, out2)),
            aes(x = .data$date,
                y = .data$level_m,
                group = as.factor(.data$sim),
                color = as.factor(.data$sim),
                linetype = "No Irr")) +
  geom_line(data = filter(MOD2, .data$sim %in% out1),
            aes(x = .data$date,
                y = .data$level_m,
                group = as.factor(.data$sim),
                color = as.factor(.data$sim),
                linetype = "Current Irr")) +
  scale_linetype_manual(breaks = c("No Irr", "Current Irr"),
                        values = c("solid", "dashed")) +
  scale_color_brewer(name = "Simulation",
                     palette = "Dark2") +
  labs(x = "", y = "Lake Elevation (m)", linetype = "") +
  facet_wrap(~lake, ncol = 1, scales = "free_y") +
  theme_bw()


ggplot() +
  geom_line(data = DIFF,
            aes(x = .data$date,
                y = .data$diff,
                group = as.factor(.data$sim)),
            color = "black",
            alpha = 0.05) +
  geom_line(data = filter(DIFF, .data$sim %in% out1),
            aes(x = .data$date,
                y = .data$diff,
                group = as.factor(.data$sim),
                color = "Reg Outliers")) +
  geom_line(data = filter(DIFF, .data$sim %in% out2),
            aes(x = .data$date,
                y = .data$diff,
                group = as.factor(.data$sim),
                color = "Diff Outliers")) +
  scale_color_manual(name = "Outlier Type",
                     breaks = c("Reg Outliers",
                                "Diff Outliers"),
                     values = c("blue", "red")) +
  labs(x = "", y = "Difference (m)") +
  facet_wrap(~lake, ncol = 1, scales = "free_y") +
  theme_bw()



ggplot() +
  geom_line(data = MOD1,
            aes(x = .data$date,
                y = .data$level_m,
                group = as.factor(.data$sim)),
            color = "black",
            alpha = 0.05) +
  geom_line(data = filter(MOD1, .data$sim %in% c(322)),
            aes(x = .data$date,
                y = .data$level_m,
                group = as.factor(.data$sim)),
            color = "red") +
  labs(x = "", y = "Lake Elevation (m)", linetype = "") +
  facet_wrap(~lake, ncol = 1, scales = "free_y") +
  theme_bw()

Evaluate scenario value independently

I'd consider the top 3 outliers to be #304, #233, and #25, but other simulations still have substantial amounts of outliers for the metrics we care about.


All outliers

Evaluate all metrics used for ecological rules, for all 3 lakes, for all 2 scenarios ("cur_irr" and "no irr"). Total metrics evaluated:


Top 10 outlier simulations

outliers_all <- outliers %>%
                group_by(.data$outlier, .data$sim, .drop = FALSE) %>%
                summarise(count = n(),
                          .groups = "drop") %>%
                filter(.data$outlier == "TRUE") %>%
                arrange(desc(.data$count)) %>%
                select(.data$sim, .data$count)
datatable(outliers_all)


outliers_all$index <- row.names(outliers_all)
ggplot(outliers_all) +
  geom_point(aes(x = as.numeric(.data$index),
                 y = .data$count)) +
  labs(x = "Simulation Rank",
       y = "Number of Outliers in Simulation") +
  theme_bw()
ggplot(outliers_all) +
  geom_histogram(aes(x = .data$count),
                 binwidth = 1) +
  labs(x = "Number of Outliers",
       y = "Number of Simulations") +
  theme_bw()


Break down by: hydrology

Break outliers down by type of metric, but lump scenarios and lakes. Total metrics evaluated:


check <- outliers %>% 
         count(.data$sim, 
               .data$hydrology,  
               .drop = FALSE)

outliers_sub <- outliers %>%
                group_by(.data$outlier, .data$sim, 
                         .data$hydrology,  
                         .drop = FALSE) %>%
                summarise(count = n(),
                          .groups = "drop") %>%
                filter(.data$outlier == "TRUE") %>%
                filter(!is.na(.data$hydrology)) %>%
                select(.data$hydrology, .data$sim, .data$count) %>%
                dcast(sim~hydrology, value.var = "count",
                      fill = 0) 
datatable(outliers_sub)


plot_outliers_sub <- melt(outliers_sub, id.vars = "sim")
ggplot(plot_outliers_sub) +
  geom_histogram(aes(x = .data$value),
                 binwidth = 1) +
  facet_wrap(~variable) +
  labs(x = "Number of Outliers",
       y = "Number of Simulations") +
  theme_bw()


Break down by: hydrology and scenario

Break outliers down by type of metric and scenario but lump by lake. Total metrics evaluated for each scenario:


outliers_sub <- outliers %>%
                group_by(.data$outlier, .data$sim, 
                         .data$hydrology, .data$scenario, 
                         .drop = FALSE) %>%
                summarise(count = n(),
                          .groups = "drop") %>%
                filter(.data$outlier == "TRUE") %>%
                filter(!is.na(.data$hydrology)) %>%
                select(.data$hydrology, .data$scenario, .data$sim, .data$count) %>%
                dcast(sim+scenario~hydrology, value.var = "count",
                      fill = 0) 
datatable(outliers_sub)


Evaluate paired differences

I'd focus on the top 2 outliers as #224 and #116, but there's an even fuzzier line for the top outlier simulations here.

Outliers for differences

Evaluate outliers in metric differences (no_irr - cur_irr). Total metrics evaluated:

df1 <- MODFLOW_metrics %>% 
       filter(.data$scenario == "no_irr") %>%
       select(.data$hydrology, .data$metric, .data$variable, .data$lake, 
              .data$sim, .data$value)
df2 <- MODFLOW_metrics %>% 
       filter(.data$scenario == "cur_irr")  %>%
       select(.data$hydrology, .data$metric, .data$variable, .data$lake, 
              .data$sim, .data$value)
combined <- full_join(df1, df2, 
                      by = c("lake", "sim", "metric", "variable", "hydrology"))
combined$diff <- combined$value.x - combined$value.y

summary_combined <- combined %>%
                    group_by(.data$lake, .data$hydrology, .data$metric, 
                             .data$variable) %>%
                    summarise(base = .data$diff[which(.data$sim == 1)],
                              q25 = quantile(.data$diff,
                                             probs = 0.75,
                                             type = 6,
                                             na.rm = TRUE),
                              q75 = quantile(.data$diff,
                                             probs = 0.25,
                                             type = 6,
                                             na.rm = TRUE),
                              IQR = IQR(.data$diff, 
                                        type = 6,
                                        na.rm = TRUE),
                              upper = .data$q25 + 1.5*.data$IQR,
                              lower = .data$q75 - 1.5*.data$IQR,
                              sd = sd(.data$diff, na.rm = TRUE),
                              .groups = "drop") %>%
                    mutate_at(c("base", "q25", "q75", "IQR", "upper", 
                                "lower", "sd"), as.numeric)
outliers        <- combined %>%
                   left_join(summary_combined, 
                             by = c("lake", "hydrology", 
                                    "metric", "variable")) %>%
                   mutate(outlier = ifelse((.data$diff > .data$upper | 
                                              .data$diff < .data$lower),
                                           TRUE, FALSE))
outliers$sim     <- factor(outliers$sim)
outliers$outlier <- factor(outliers$outlier)

outliers_all <- outliers %>%
                group_by(.data$outlier, .data$sim, 
                         .drop = FALSE) %>%
                summarise(count = n(),
                          .groups = "drop") %>%
                filter(.data$outlier == TRUE) %>%
                select(.data$sim, .data$count) %>%
                arrange(desc(.data$count))
datatable(outliers_all)


outliers_all$index <- row.names(outliers_all)
ggplot(outliers_all) +
  geom_point(aes(x = as.numeric(.data$index),
                 y = .data$count)) +
  labs(x = "Simulation Rank",
       y = "Number of Outliers in Simulation") +
  theme_bw()
ggplot(outliers_all) +
  geom_histogram(aes(x = .data$count),
                 binwidth = 1) +
  labs(x = "Number of Outliers",
       y = "Number of Simulations") +
  theme_bw()


Break down by: hydrology

Break outliers down by type of metric, but lump scenarios and lakes. Total metrics evaluated:


outliers_sub <- outliers %>%
                group_by(.data$outlier, .data$sim, 
                         .data$hydrology,  
                         .drop = FALSE) %>%
                summarise(count = n(),
                          .groups = "drop") %>%
                filter(.data$outlier == "TRUE") %>%
                filter(!is.na(.data$hydrology)) %>%
                select(.data$hydrology, .data$sim, .data$count) %>%
                dcast(sim~hydrology, value.var = "count") 

datatable(outliers_sub)


plot_outliers_sub <- melt(outliers_sub, id.vars = "sim")
ggplot(plot_outliers_sub) +
  geom_histogram(aes(x = .data$value),
                 binwidth = 1) +
  facet_wrap(~variable) +
  labs(x = "Number of Outliers",
       y = "Number of Simulations") +
  theme_bw()


WDNR-Water-Use/CSLSscenarios documentation built on Nov. 10, 2021, 4:14 p.m.