library(dplyr)
library(tidyr)
library(ggplot2)
#library(ordinal)
library(purrr)
library(mgcv)
library(ecostattoolkit)

knitr::opts_chunk$set(tidy=TRUE, warning=FALSE, echo=FALSE, dpi = 300, cache = T, autodep = T, fig.height = 6, fig.width = 5)
EQR_boundaries <- data.frame(Boundary = c("HG", "GM", "MP"), Value = c(0.80, 0.60, 0.40))

log10_TP_slope <- -1
log10_TP_intercept <- -(log10(50) * log10_TP_slope - EQR_boundaries[EQR_boundaries$Boundary=="GM", "Value"])

set.seed(2)

get_sim_dat <- function(n, xerr = FALSE, slope = -1, boundaries = EQR_boundaries){

  log10_TP_slope <- slope
  log10_TP_intercept <- -(log10(50) * log10_TP_slope - boundaries[boundaries$Boundary=="GM", "Value"])

  sim_dat <- data.frame(Region = rep(c("Few low nutrient lakes", "Good data span", "Few high nutrient lakes"),
                                     each = n),
                        TP = c(
                          10 ^ rnorm(n, log10(100), 0.25),
                          10 ^ rnorm(n, log10(50), 0.25),
                          10 ^ rnorm(n, log10(35), 0.25))
  ) %>%
    mutate(log10_TP = log10(TP),
           EQR = log10_TP_intercept + log10_TP_slope * log10(TP) + rnorm(n, 0, 0.3)) %>%
    mutate(Bio_Class = cut(EQR, breaks = c(-Inf, boundaries$Value, Inf),
                           labels = rev(c("High", "Good", "Moderate", "Poor")),
                           ordered_result = TRUE),
           Bio_Class_2 = cut(EQR, 
                             c(-Inf, boundaries[boundaries$Boundary=="GM","Value"], Inf),
                             labels = c("Mod_or_Worse", "Good_or_better")),
           Bio_Class_num = as.numeric(Bio_Class)+1) %>%   tbl_df() %>% 
    mutate(log10_TP  = if(xerr){log10_TP + rnorm(n, 0, xerr)}else{log10_TP},
           TP = 10^log10_TP)

  sim_dat <- sim_dat %>% 
    filter(Region != "Few high nutrient lakes")

  return(sim_dat)
}

sim_dat <- get_sim_dat(n = 100, xerr = F)

A comparison of methods provided in the ECOSTAT toolkit.

In preparation for the meeting I compared the methods provided in the toolkit. I used simulated data so that I could test each method 100s of times against data with a known relationship between TP and Ecological Status, and with different types of random noise. I also compared how well they work when there are few lakes with low nutrients.

Example simulated data

The data are set up so that the boundary between moderate and good status occurs at TP of 50 µg/L.

# p <- sim_dat %>% 
#   ggplot(aes(x = TP)) %>% 
#   + geom_histogram(bins = 15) %>% 
#   + facet_wrap( ~ Region, ncol = 1) %>% 
#   + scale_x_continuous(trans = "log10", breaks = c(1, 5, 10, 50, 100, 500, 1000)) %>% 
#   + annotation_logticks(sides = "b")
# p
# p <- sim_dat %>% 
#   ggplot(aes(x = EQR)) %>% 
#   + geom_histogram(bins = 15) %>% 
#   + facet_wrap( ~ Region, ncol = 1) #%>% 
# p
p <- sim_dat %>% 
  ggplot(aes(x = TP, y = EQR)) %>% 
  + geom_point() %>% 
  + geom_smooth(method = "lm") %>% 
  + facet_wrap( ~ Region, ncol = 1) %>% 
  + scale_x_continuous("TP [µg/L]", trans = "log10", breaks = c(1, 5, 10, 50, 100, 500, 1000)) %>% 
  + annotation_logticks(sides = "b") %>% 
  + geom_hline(data = filter(EQR_boundaries, Boundary == "GM"), aes(yintercept = Value, colour = Boundary)) %>% 
  + geom_vline(data = filter(EQR_boundaries, Boundary == "GM"), aes(xintercept = 50, colour = Boundary))
p
plot_sim_out <- function(dat){
  p <- dat %>% 
    mutate(GM = ifelse(GM > 1000, Inf, GM)) %>%
    ggplot(aes(x = Method, y = GM, fill = Method_type)) %>% 
  + geom_hline(yintercept = 50) %>% 
  + geom_boxplot() %>% 
  + facet_wrap(~ Region, nrow = 3) %>% 
  + labs(y = "Target nutrient concentration") %>% 
    + expand_limits(y = 0)
return(p)
}

Compare methods

With measurement error only in the EQR

status_boundaries <- c("HG" = 0.8, "GM" = 0.6, "MP" = 0.4)

method_types = c(OLS = "Regression", MM = "Mismatch", Median = "Quantile",
                   q75 = "Quantile", AdjQ = "Quantile", RMA = "Regression")
set.seed(1)
sim_out_errY <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = F) %>%
                              mutate(Rep = x)) %>% 
  tbl_df()

boundaries_errY <- sim_out_errY %>% 
  group_by(Region, Rep) %>% 
  do(boundaries_all(., xvar = "log10_TP", yvar = "EQR",
                             status_boundaries = status_boundaries,
                             class_var = "Bio_Class")) %>% 
  mutate_if(is.numeric, funs((10^.))) %>% 
  mutate(Method_type = method_types[match(Method, names(method_types))],
         Method = factor(Method, levels = c("Median", "q75", "AdjQ", "OLS", "RMA", "MM"), ordered = T))
boundaries_errY %>% plot_sim_out(.) +
  labs(title = "Error only in EQR")

With equal (proportional) observation error in nutrient and EQR measurements

sim_out_errXY <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = 0.3) %>%
                              mutate(Rep = x)) %>%
  tbl_df() 

boundaries_errXY <- sim_out_errXY %>% 
  group_by(Region, Rep) %>% 
  do(boundaries_all(., xvar = "log10_TP", yvar = "EQR",
                             status_boundaries = status_boundaries,
                             class_var = "Bio_Class")) %>% 
  mutate_if(is.numeric, funs((10^.))) %>% 
  mutate(Method_type = method_types[match(Method, names(method_types))],
         Method = factor(Method, levels = c("Median", "q75", "AdjQ", "OLS", "RMA", "MM"), ordered = T))
boundaries_errXY %>% plot_sim_out(.) +
  labs(title = "Equal observation error\nin nutrient and EQR measurements")
# # No causal relationship
# # Given a good data span, estimates by most methods are centered on the median concentration in the data set.
# # A causal relationship should however be test for in advance
# 
# sim_out_slope0 <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = 0, slope = 0) %>%
#                               mutate(Rep = x)) %>%
#   tbl_df() 
# 
# boundaries_slope0 <- sim_out_slope0 %>% 
#   group_by(Region, Rep) %>% 
#   do(boundaries_all(., xvar = "log10_TP", yvar = "EQR",
#                              status_boundaries = status_boundaries,
#                              class_var = "Bio_Class")) %>% 
#   mutate_if(is.numeric, funs((10^.))) %>% 
#   mutate(Method_type = method_types[match(Method, names(method_types))],
#          Method = factor(Method, levels = c("Median", "q75", "AdjQ", "OLS", "RMA", "MM"), ordered = T))
# 
# 
# boundaries_slope0 %>% plot_sim_out(.) +
#   labs(title = "No relationship between EQR and TP")

Extra "outliers"

# Wild outliers

sim_out_outliers <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = 0, slope = -1) %>%
                              mutate(Rep = x,
                                     EQR = EQR + rbinom(length(EQR), 1, 0.2) * rt(length(EQR), 2),
                                     Bio_Class = cut(EQR, breaks = c(-Inf, EQR_boundaries$Value, Inf),
                         labels = rev(c("High", "Good", "Moderate", "Poor")),
                         ordered_result = TRUE))) %>%
  tbl_df() 

boundaries_outliers <- sim_out_outliers %>% 
  group_by(Region, Rep) %>% 
  do(boundaries_all(., xvar = "log10_TP", yvar = "EQR",
                             status_boundaries = status_boundaries,
                             class_var = "Bio_Class")) %>% 
  mutate_if(is.numeric, funs((10^.))) %>% 
  mutate(Method_type = factor(method_types[match(Method, names(method_types))]),
         Method = factor(Method, levels = c("Median", "q75", "AdjQ", "OLS", "RMA", "MM"), ordered = T))

Regression based methods are very sensitive to outliers. Quantile based methods are better than regression methods, but over estimate target when there are few low TP lakes. Mismatch method performs best.

boundaries_outliers %>% plot_sim_out(.) +
  labs(title = "Error only in EQR, but 20% outliers")
boundaries_outliers %>% 
  subset(., Method_type %in% c("Regression") == FALSE, drop = FALSE) %>% 
  plot_sim_out(.) +
  labs(title = "Error only in EQR, but 20% outliers") + 
  scale_color_discrete(drop = FALSE)
## Wild outliers + slope -0-5

# sim_out_outliers1 <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = 0, slope = -0.5) %>%
#                               mutate(Rep = x, 
#                                      EQR = EQR + rbinom(length(EQR), 1, 0) * rt(length(EQR), 2),
#                                      Bio_Class = cut(EQR, breaks = c(-Inf, EQR_boundaries$Value, Inf),
#                          labels = rev(c("High", "Good", "Moderate", "Poor")),
#                          ordered_result = TRUE))) %>%
#   tbl_df() %>%
#   get_estimates(.) %>% 
#   mutate(Outliers = "0%")
# 
# 
# sim_out_outliers2 <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = 0, slope = -0.5) %>%
#                               mutate(Rep = x, 
#                                      EQR = EQR + rbinom(length(EQR), 1, 0.2) * rt(length(EQR), 2),
#                                      Bio_Class = cut(EQR, breaks = c(-Inf, EQR_boundaries$Value, Inf),
#                          labels = rev(c("High", "Good", "Moderate", "Poor")),
#                          ordered_result = TRUE))) %>%
#   tbl_df() %>%
#   get_estimates(.) %>% 
#   mutate(Outliers = "20%")
# 
# sim_out <- bind_rows(sim_out_outliers1, sim_out_outliers2)
# 
# plot_sim_out(sim_out) %>%  
#   + facet_wrap(~Region + Outliers, labeller = "label_both", ncol = 2)

Results summary

Conclusion

Given the stability of the mismatch method in the face of outliers, and the fact that real data will have measurement error in both EQR and nutrient concentrations (e.g. TP) - the Mismatch method might be a good compromise.



andrewdolman/ecostattoolkit documentation built on May 10, 2019, 10:30 a.m.