library(dplyr) library(tidyr) library(ggplot2) #library(ordinal) library(purrr) library(mgcv) library(ecostattoolkit) knitr::opts_chunk$set(tidy=TRUE, warning=FALSE, echo=FALSE, dpi = 300, cache = T, autodep = T, fig.height = 6, fig.width = 5)
EQR_boundaries <- data.frame(Boundary = c("HG", "GM", "MP"), Value = c(0.80, 0.60, 0.40)) log10_TP_slope <- -1 log10_TP_intercept <- -(log10(50) * log10_TP_slope - EQR_boundaries[EQR_boundaries$Boundary=="GM", "Value"]) set.seed(2) get_sim_dat <- function(n, xerr = FALSE, slope = -1, boundaries = EQR_boundaries){ log10_TP_slope <- slope log10_TP_intercept <- -(log10(50) * log10_TP_slope - boundaries[boundaries$Boundary=="GM", "Value"]) sim_dat <- data.frame(Region = rep(c("Few low nutrient lakes", "Good data span", "Few high nutrient lakes"), each = n), TP = c( 10 ^ rnorm(n, log10(100), 0.25), 10 ^ rnorm(n, log10(50), 0.25), 10 ^ rnorm(n, log10(35), 0.25)) ) %>% mutate(log10_TP = log10(TP), EQR = log10_TP_intercept + log10_TP_slope * log10(TP) + rnorm(n, 0, 0.3)) %>% mutate(Bio_Class = cut(EQR, breaks = c(-Inf, boundaries$Value, Inf), labels = rev(c("High", "Good", "Moderate", "Poor")), ordered_result = TRUE), Bio_Class_2 = cut(EQR, c(-Inf, boundaries[boundaries$Boundary=="GM","Value"], Inf), labels = c("Mod_or_Worse", "Good_or_better")), Bio_Class_num = as.numeric(Bio_Class)+1) %>% tbl_df() %>% mutate(log10_TP = if(xerr){log10_TP + rnorm(n, 0, xerr)}else{log10_TP}, TP = 10^log10_TP) sim_dat <- sim_dat %>% filter(Region != "Few high nutrient lakes") return(sim_dat) } sim_dat <- get_sim_dat(n = 100, xerr = F)
In preparation for the meeting I compared the methods provided in the toolkit. I used simulated data so that I could test each method 100s of times against data with a known relationship between TP and Ecological Status, and with different types of random noise. I also compared how well they work when there are few lakes with low nutrients.
The data are set up so that the boundary between moderate and good status occurs at TP of 50 µg/L.
# p <- sim_dat %>% # ggplot(aes(x = TP)) %>% # + geom_histogram(bins = 15) %>% # + facet_wrap( ~ Region, ncol = 1) %>% # + scale_x_continuous(trans = "log10", breaks = c(1, 5, 10, 50, 100, 500, 1000)) %>% # + annotation_logticks(sides = "b") # p
# p <- sim_dat %>% # ggplot(aes(x = EQR)) %>% # + geom_histogram(bins = 15) %>% # + facet_wrap( ~ Region, ncol = 1) #%>% # p
p <- sim_dat %>% ggplot(aes(x = TP, y = EQR)) %>% + geom_point() %>% + geom_smooth(method = "lm") %>% + facet_wrap( ~ Region, ncol = 1) %>% + scale_x_continuous("TP [µg/L]", trans = "log10", breaks = c(1, 5, 10, 50, 100, 500, 1000)) %>% + annotation_logticks(sides = "b") %>% + geom_hline(data = filter(EQR_boundaries, Boundary == "GM"), aes(yintercept = Value, colour = Boundary)) %>% + geom_vline(data = filter(EQR_boundaries, Boundary == "GM"), aes(xintercept = 50, colour = Boundary)) p
plot_sim_out <- function(dat){ p <- dat %>% mutate(GM = ifelse(GM > 1000, Inf, GM)) %>% ggplot(aes(x = Method, y = GM, fill = Method_type)) %>% + geom_hline(yintercept = 50) %>% + geom_boxplot() %>% + facet_wrap(~ Region, nrow = 3) %>% + labs(y = "Target nutrient concentration") %>% + expand_limits(y = 0) return(p) }
status_boundaries <- c("HG" = 0.8, "GM" = 0.6, "MP" = 0.4) method_types = c(OLS = "Regression", MM = "Mismatch", Median = "Quantile", q75 = "Quantile", AdjQ = "Quantile", RMA = "Regression")
set.seed(1) sim_out_errY <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = F) %>% mutate(Rep = x)) %>% tbl_df() boundaries_errY <- sim_out_errY %>% group_by(Region, Rep) %>% do(boundaries_all(., xvar = "log10_TP", yvar = "EQR", status_boundaries = status_boundaries, class_var = "Bio_Class")) %>% mutate_if(is.numeric, funs((10^.))) %>% mutate(Method_type = method_types[match(Method, names(method_types))], Method = factor(Method, levels = c("Median", "q75", "AdjQ", "OLS", "RMA", "MM"), ordered = T))
boundaries_errY %>% plot_sim_out(.) + labs(title = "Error only in EQR")
sim_out_errXY <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = 0.3) %>% mutate(Rep = x)) %>% tbl_df() boundaries_errXY <- sim_out_errXY %>% group_by(Region, Rep) %>% do(boundaries_all(., xvar = "log10_TP", yvar = "EQR", status_boundaries = status_boundaries, class_var = "Bio_Class")) %>% mutate_if(is.numeric, funs((10^.))) %>% mutate(Method_type = method_types[match(Method, names(method_types))], Method = factor(Method, levels = c("Median", "q75", "AdjQ", "OLS", "RMA", "MM"), ordered = T))
boundaries_errXY %>% plot_sim_out(.) + labs(title = "Equal observation error\nin nutrient and EQR measurements")
# # No causal relationship # # Given a good data span, estimates by most methods are centered on the median concentration in the data set. # # A causal relationship should however be test for in advance # # sim_out_slope0 <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = 0, slope = 0) %>% # mutate(Rep = x)) %>% # tbl_df() # # boundaries_slope0 <- sim_out_slope0 %>% # group_by(Region, Rep) %>% # do(boundaries_all(., xvar = "log10_TP", yvar = "EQR", # status_boundaries = status_boundaries, # class_var = "Bio_Class")) %>% # mutate_if(is.numeric, funs((10^.))) %>% # mutate(Method_type = method_types[match(Method, names(method_types))], # Method = factor(Method, levels = c("Median", "q75", "AdjQ", "OLS", "RMA", "MM"), ordered = T)) # # # boundaries_slope0 %>% plot_sim_out(.) + # labs(title = "No relationship between EQR and TP")
# Wild outliers sim_out_outliers <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = 0, slope = -1) %>% mutate(Rep = x, EQR = EQR + rbinom(length(EQR), 1, 0.2) * rt(length(EQR), 2), Bio_Class = cut(EQR, breaks = c(-Inf, EQR_boundaries$Value, Inf), labels = rev(c("High", "Good", "Moderate", "Poor")), ordered_result = TRUE))) %>% tbl_df() boundaries_outliers <- sim_out_outliers %>% group_by(Region, Rep) %>% do(boundaries_all(., xvar = "log10_TP", yvar = "EQR", status_boundaries = status_boundaries, class_var = "Bio_Class")) %>% mutate_if(is.numeric, funs((10^.))) %>% mutate(Method_type = factor(method_types[match(Method, names(method_types))]), Method = factor(Method, levels = c("Median", "q75", "AdjQ", "OLS", "RMA", "MM"), ordered = T))
Regression based methods are very sensitive to outliers. Quantile based methods are better than regression methods, but over estimate target when there are few low TP lakes. Mismatch method performs best.
boundaries_outliers %>% plot_sim_out(.) + labs(title = "Error only in EQR, but 20% outliers")
boundaries_outliers %>% subset(., Method_type %in% c("Regression") == FALSE, drop = FALSE) %>% plot_sim_out(.) + labs(title = "Error only in EQR, but 20% outliers") + scale_color_discrete(drop = FALSE)
## Wild outliers + slope -0-5 # sim_out_outliers1 <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = 0, slope = -0.5) %>% # mutate(Rep = x, # EQR = EQR + rbinom(length(EQR), 1, 0) * rt(length(EQR), 2), # Bio_Class = cut(EQR, breaks = c(-Inf, EQR_boundaries$Value, Inf), # labels = rev(c("High", "Good", "Moderate", "Poor")), # ordered_result = TRUE))) %>% # tbl_df() %>% # get_estimates(.) %>% # mutate(Outliers = "0%") # # # sim_out_outliers2 <- plyr::ldply(1:100, function(x) get_sim_dat(60, xerr = 0, slope = -0.5) %>% # mutate(Rep = x, # EQR = EQR + rbinom(length(EQR), 1, 0.2) * rt(length(EQR), 2), # Bio_Class = cut(EQR, breaks = c(-Inf, EQR_boundaries$Value, Inf), # labels = rev(c("High", "Good", "Moderate", "Poor")), # ordered_result = TRUE))) %>% # tbl_df() %>% # get_estimates(.) %>% # mutate(Outliers = "20%") # # sim_out <- bind_rows(sim_out_outliers1, sim_out_outliers2) # # plot_sim_out(sim_out) %>% # + facet_wrap(~Region + Outliers, labeller = "label_both", ncol = 2)
Given the stability of the mismatch method in the face of outliers, and the fact that real data will have measurement error in both EQR and nutrient concentrations (e.g. TP) - the Mismatch method might be a good compromise.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.