In andrewdolman/ecostattoolkit: An R package to implement and test the ECOSTAT toolkit for estimating nutrient boundaries

library(dplyr)
library(tidyr)
library(ggplot2)
#library(ordinal)
library(purrr)
library(mgcv)
library(ecostattoolkit)

knitr::opts_chunk$set(tidy=TRUE, warning=FALSE, echo=FALSE, dpi = 300, cache = T, autodep = T, fig.height = 6, fig.width = 5)

EQR_boundaries <- data.frame(Boundary = c("HG", "GM", "MP"), Value = c(0.80, 0.60, 0.40))

log10_TP_slope <- -1
log10_TP_intercept <- -(log10(50) * log10_TP_slope - EQR_boundaries[EQR_boundaries$Boundary=="GM", "Value"])

set.seed(2)

get_sim_dat <- function(n, xerr = FALSE, slope = -1, boundaries = EQR_boundaries){

  log10_TP_slope <- slope
  log10_TP_intercept <- -(log10(50) * log10_TP_slope - boundaries[boundaries$Boundary=="GM", "Value"])

  sim_dat <- data.frame(Region = rep(c("Few low nutrient lakes", "Good data span", "Few high nutrient lakes"),
                                     each = n),
                        TP = c(
                          10 ^ rnorm(n, log10(100), 0.25),
                          10 ^ rnorm(n, log10(50), 0.25),
                          10 ^ rnorm(n, log10(35), 0.25))
  ) %>%
    mutate(log10_TP = log10(TP),
           EQR = log10_TP_intercept + log10_TP_slope * log10(TP) + rnorm(n, 0, 0.3)) %>%
    mutate(Bio_Class = cut(EQR, breaks = c(-Inf, boundaries$Value, Inf),
                           labels = rev(c("High", "Good", "Moderate", "Poor")),
                           ordered_result = TRUE),
           Bio_Class_2 = cut(EQR, 
                             c(-Inf, boundaries[boundaries$Boundary=="GM","Value"], Inf),
                             labels = c("Mod_or_Worse", "Good_or_better")),
           Bio_Class_num = as.numeric(Bio_Class)+1) %>%   tbl_df() %>% 
    mutate(log10_TP  = if(xerr){log10_TP + rnorm(n, 0, xerr)}else{log10_TP},
           TP = 10^log10_TP)

  sim_dat <- sim_dat %>% 
    filter(Region != "Few high nutrient lakes")

  return(sim_dat)
}

sim_dat <- get_sim_dat(n = 100, xerr = F)

A comparison of methods provided in the ECOSTAT toolkit.

In preparation for the meeting I compared the methods provided in the toolkit. I used simulated data so that I could test each method 100s of times against data with a known relationship between TP and Ecological Status, and with different types of random noise. I also compared how well they work when there are few lakes with low nutrients.

Example simulated data

The data are set up so that the boundary between moderate and good status occurs at TP of 50 µg/L.

# p <- sim_dat %>% 
#   ggplot(aes(x = TP)) %>% 
#   + geom_histogram(bins = 15) %>% 
#   + facet_wrap( ~ Region, ncol = 1) %>% 
#   + scale_x_continuous(trans = "log10", breaks = c(1, 5, 10, 50, 100, 500, 1000)) %>% 
#   + annotation_logticks(sides = "b")
# p

# p <- sim_dat %>% 
#   ggplot(aes(x = EQR)) %>% 
#   + geom_histogram(bins = 15) %>% 
#   + facet_wrap( ~ Region, ncol = 1) #%>% 
# p

p <- sim_dat %>% 
  ggplot(aes(x = TP, y = EQR)) %>% 
  + geom_point() %>% 
  + geom_smooth(method = "lm") %>% 
  + facet_wrap( ~ Region, ncol = 1) %>% 
  + scale_x_continuous("TP [µg/L]", trans = "log10", breaks = c(1, 5, 10, 50, 100, 500, 1000)) %>% 
  + annotation_logticks(sides = "b") %>% 
  + geom_hline(data = filter(EQR_boundaries, Boundary == "GM"), aes(yintercept = Value, colour = Boundary)) %>% 
  + geom_vline(data = filter(EQR_boundaries, Boundary == "GM"), aes(xintercept = 50, colour = Boundary))
p

plot_sim_out <- function(dat){
  p <- dat %>% 
    mutate(GM = ifelse(GM > 1000, Inf, GM)) %>%
    ggplot(aes(x = Method, y = GM, fill = Method_type)) %>% 
  + geom_hline(yintercept = 50) %>% 
  + geom_boxplot() %>% 
  + facet_wrap(~ Region, nrow = 3) %>% 
  + labs(y = "Target nutrient concentration") %>% 
    + expand_limits(y = 0)
return(p)
}

Compare methods

Quantile methods
- Medians = Median nutrient concentration for Good lakes
- q75 = 75 percentile of nutrient concentration for Good lakes
- AdjQ = Adjusted Quantiles (mean of q25 for Moderate lakes and q75 for Good lakes)
Regression methods
- OLS = Ordinary Least Square Regression
- RMA = Ranged Major Axis regression (Type II regression)
Mismatch
- MM = Mismatch method

With measurement error only in the EQR

status_boundaries <- c("HG" = 0.8, "GM" = 0.6, "MP" = 0.4)

method_types = c(OLS = "Regression", MM = "Mismatch", Median = "Quantile",
                   q75 = "Quantile", AdjQ = "Quantile", RMA = "Regression")

set.seed(1)
sim_out_errY <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = F) %>%
                              mutate(Rep = x)) %>% 
  tbl_df()

boundaries_errY <- sim_out_errY %>% 
  group_by(Region, Rep) %>% 
  do(boundaries_all(., xvar = "log10_TP", yvar = "EQR",
                             status_boundaries = status_boundaries,
                             class_var = "Bio_Class")) %>% 
  mutate_if(is.numeric, funs((10^.))) %>% 
  mutate(Method_type = method_types[match(Method, names(method_types))],
         Method = factor(Method, levels = c("Median", "q75", "AdjQ", "OLS", "RMA", "MM"), ordered = T))

boundaries_errY %>% plot_sim_out(.) +
  labs(title = "Error only in EQR")

With equal (proportional) observation error in nutrient and EQR measurements

sim_out_errXY <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = 0.3) %>%
                              mutate(Rep = x)) %>%
  tbl_df() 

boundaries_errXY <- sim_out_errXY %>% 
  group_by(Region, Rep) %>% 
  do(boundaries_all(., xvar = "log10_TP", yvar = "EQR",
                             status_boundaries = status_boundaries,
                             class_var = "Bio_Class")) %>% 
  mutate_if(is.numeric, funs((10^.))) %>% 
  mutate(Method_type = method_types[match(Method, names(method_types))],
         Method = factor(Method, levels = c("Median", "q75", "AdjQ", "OLS", "RMA", "MM"), ordered = T))

boundaries_errXY %>% plot_sim_out(.) +
  labs(title = "Equal observation error\nin nutrient and EQR measurements")

# # No causal relationship
# # Given a good data span, estimates by most methods are centered on the median concentration in the data set.
# # A causal relationship should however be test for in advance
# 
# sim_out_slope0 <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = 0, slope = 0) %>%
#                               mutate(Rep = x)) %>%
#   tbl_df() 
# 
# boundaries_slope0 <- sim_out_slope0 %>% 
#   group_by(Region, Rep) %>% 
#   do(boundaries_all(., xvar = "log10_TP", yvar = "EQR",
#                              status_boundaries = status_boundaries,
#                              class_var = "Bio_Class")) %>% 
#   mutate_if(is.numeric, funs((10^.))) %>% 
#   mutate(Method_type = method_types[match(Method, names(method_types))],
#          Method = factor(Method, levels = c("Median", "q75", "AdjQ", "OLS", "RMA", "MM"), ordered = T))
# 
# 
# boundaries_slope0 %>% plot_sim_out(.) +
#   labs(title = "No relationship between EQR and TP")

Extra "outliers"

# Wild outliers

sim_out_outliers <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = 0, slope = -1) %>%
                              mutate(Rep = x,
                                     EQR = EQR + rbinom(length(EQR), 1, 0.2) * rt(length(EQR), 2),
                                     Bio_Class = cut(EQR, breaks = c(-Inf, EQR_boundaries$Value, Inf),
                         labels = rev(c("High", "Good", "Moderate", "Poor")),
                         ordered_result = TRUE))) %>%
  tbl_df() 

boundaries_outliers <- sim_out_outliers %>% 
  group_by(Region, Rep) %>% 
  do(boundaries_all(., xvar = "log10_TP", yvar = "EQR",
                             status_boundaries = status_boundaries,
                             class_var = "Bio_Class")) %>% 
  mutate_if(is.numeric, funs((10^.))) %>% 
  mutate(Method_type = factor(method_types[match(Method, names(method_types))]),
         Method = factor(Method, levels = c("Median", "q75", "AdjQ", "OLS", "RMA", "MM"), ordered = T))

Regression based methods are very sensitive to outliers. Quantile based methods are better than regression methods, but over estimate target when there are few low TP lakes. Mismatch method performs best.

boundaries_outliers %>% plot_sim_out(.) +
  labs(title = "Error only in EQR, but 20% outliers")

boundaries_outliers %>% 
  subset(., Method_type %in% c("Regression") == FALSE, drop = FALSE) %>% 
  plot_sim_out(.) +
  labs(title = "Error only in EQR, but 20% outliers") + 
  scale_color_discrete(drop = FALSE)

## Wild outliers + slope -0-5

# sim_out_outliers1 <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = 0, slope = -0.5) %>%
#                               mutate(Rep = x, 
#                                      EQR = EQR + rbinom(length(EQR), 1, 0) * rt(length(EQR), 2),
#                                      Bio_Class = cut(EQR, breaks = c(-Inf, EQR_boundaries$Value, Inf),
#                          labels = rev(c("High", "Good", "Moderate", "Poor")),
#                          ordered_result = TRUE))) %>%
#   tbl_df() %>%
#   get_estimates(.) %>% 
#   mutate(Outliers = "0%")
# 
# 
# sim_out_outliers2 <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = 0, slope = -0.5) %>%
#                               mutate(Rep = x, 
#                                      EQR = EQR + rbinom(length(EQR), 1, 0.2) * rt(length(EQR), 2),
#                                      Bio_Class = cut(EQR, breaks = c(-Inf, EQR_boundaries$Value, Inf),
#                          labels = rev(c("High", "Good", "Moderate", "Poor")),
#                          ordered_result = TRUE))) %>%
#   tbl_df() %>%
#   get_estimates(.) %>% 
#   mutate(Outliers = "20%")
# 
# sim_out <- bind_rows(sim_out_outliers1, sim_out_outliers2)
# 
# plot_sim_out(sim_out) %>%  
#   + facet_wrap(~Region + Outliers, labeller = "label_both", ncol = 2)

Results summary

All methods perform well when the data range is good and the data have no large outliers
Estimates are unbiased (centered around the true value, here 50) and with narrow spread
- q75 estimates a different quantity than all the other methods
When there are few low nutrient lakes, the quantile method overestimate nutrient boundaries badly. OLS is correct, Mismatch and RMA methods also overestimate but not as bad as quantile methods.
When there is observation error in the nutrient concentration estimates (here TP), and few low TP lakes, OLS estimates boundaries that are too low. RMA and MM get it correct. Quantile methods again overestimate.
When there are large outliers, regression methods (RMA and OLS) both can produce very large and small boundary estimates. Estimates from the Quantile and Mismatch methods are more stable, Mismatch is the best.

Conclusion

Given the stability of the mismatch method in the face of outliers, and the fact that real data will have measurement error in both EQR and nutrient concentrations (e.g. TP) - the Mismatch method might be a good compromise.

andrewdolman/ecostattoolkit documentation built on May 10, 2019, 10:30 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

andrewdolman/ecostattoolkit
An R package to implement and test the ECOSTAT toolkit for estimating nutrient boundaries

In andrewdolman/ecostattoolkit: An R package to implement and test the ECOSTAT toolkit for estimating nutrient boundaries

A comparison of methods provided in the ECOSTAT toolkit.

Example simulated data

Compare methods

With measurement error only in the EQR

With equal (proportional) observation error in nutrient and EQR measurements

Extra "outliers"

Results summary

Conclusion

R Package Documentation

Browse R Packages

We want your feedback!

andrewdolman/ecostattoolkit An R package to implement and test the ECOSTAT toolkit for estimating nutrient boundaries

In andrewdolman/ecostattoolkit: An R package to implement and test the ECOSTAT toolkit for estimating nutrient boundaries

A comparison of methods provided in the ECOSTAT toolkit.

Example simulated data

Compare methods

With measurement error only in the EQR

With equal (proportional) observation error in nutrient and EQR measurements

Extra "outliers"

Results summary

Conclusion

R Package Documentation

Browse R Packages

We want your feedback!

andrewdolman/ecostattoolkit
An R package to implement and test the ECOSTAT toolkit for estimating nutrient boundaries