The threshold approach intends to find the optimal response threshold at the inflection point (or "elbow") based on the data (x = threshold; y = pooled variance), manifested by an exponential relationship between threshold and pooled variance.
However, there are cases in which no such inflection point exists: 1. a linear relationship between threshold and pooled variance (too much noise in the dataset); 2. a polynomial relationship between threshold and pooled variance (too little noise in the dataset).
The goal is to provide a function that can warn the user, or inform them that the threshold approach cannot be used, in these cases.
library(here) library(dplyr) library(tidyr) library(purrr) library(ggplot2) library(stringr) library(readr) library(Rcurvep)
# Read the pooled-variance-vs-threshold tables for the three endpoint
# families, then stack them into one long table.
percentd <- read_tsv(
  here("data-raw", "bmr_exclude", "percent_thres_all.txt"),
  col_types = cols()
)
distmd <- read_tsv(
  here("data-raw", "bmr_exclude", "distmoved_thres_all.txt"),
  col_types = cols()
)
simid <- read_tsv(
  here("data-raw", "bmr_exclude", "simi_thres_all.txt"),
  col_types = cols()
)

# All endpoints combined, plus the subset of endpoints used for the
# detailed comparison below.
allcurved <- bind_rows(percentd, distmd, simid)
sigcurved <- allcurved %>%
  filter(
    endpoint %in% c(
      "percent_affected_96",
      "percent_mortality_96",
      "120_a_L1_distmoved"
    )
  )
# https://stats.stackexchange.com/questions/11947/fitting-an-exponential-model-to-data
# Fit pooled_variance ~ exp(a + b * threshold) by nonlinear least
# squares with a fixed (0, 0) start.
#
# dd: a data frame with `threshold` and `pooled_variance` columns.
# Returns a list with:
#   update_dataset - dd plus a `fitted_value` column
#   model          - the fitted nls object
nls_fit <- function(dd) {
  exp_mod <- nls(
    pooled_variance ~ exp(a + b * threshold),
    data = dd,
    start = list(a = 0, b = 0)
  )
  list(
    update_dataset = mutate(dd, fitted_value = fitted(exp_mod)),
    model = exp_mod
  )
}
# Same exponential nls fit as nls_fit(), but the starting values come
# from a linear model on the log scale, which makes convergence more
# reliable than a fixed (0, 0) start.
#
# dd: a data frame with `threshold` and `pooled_variance` columns.
# Returns list(update_dataset = dd + fitted_value, model = nls object).
nls_fit_sp <- function(dd) {
  # log(y) = a + b * x supplies the starting coefficients.
  start_mod <- lm(log(pooled_variance) ~ threshold, data = dd)
  exp_mod <- nls(
    pooled_variance ~ exp(a + b * threshold),
    data = dd,
    start = list(a = coef(start_mod)[1], b = coef(start_mod)[2])
  )
  list(
    update_dataset = mutate(dd, fitted_value = fitted(exp_mod)),
    model = exp_mod
  )
}
http://douglas-watson.github.io/post/2018-09_exponential_curve_fitting/
# Exponential decay fit via the self-starting asymptotic regression
# model SSasymp, so no manual starting values are needed.
#
# dd: a data frame with `threshold` and `pooled_variance` columns.
# Returns list(update_dataset = dd + fitted_value, model = nls object).
nls_fit_SS <- function(dd) {
  ss_mod <- nls(
    pooled_variance ~ SSasymp(threshold, yf, y0, log_alpha),
    data = dd
  )
  list(
    update_dataset = mutate(dd, fitted_value = fitted(ss_mod)),
    model = ss_mod
  )
}
# Joanne's suggestion: linear model on the log scale; fitted values are
# mapped back to the original scale with exp().
#
# dd: a data frame with `threshold` and `pooled_variance` columns.
# Returns list(update_dataset = dd + fitted_value, model = lm object).
lm_log_fit <- function(dd) {
  log_mod <- lm(log(pooled_variance) ~ threshold, data = dd)
  list(
    update_dataset = mutate(dd, fitted_value = exp(fitted(log_mod))),
    model = log_mod
  )
}
# https://stats.stackexchange.com/questions/240455/fitting-exponential-regression-model-by-mle
# MLE-based alternative: Gamma GLM with the (default) inverse link.
#
# dd: a data frame with `threshold` and `pooled_variance` columns.
# Returns list(update_dataset = dd + fitted_value, model = glm object).
glm_gamma_fit <- function(dd) {
  gamma_mod <- glm(
    pooled_variance ~ threshold,
    data = dd,
    family = Gamma(link = "inverse")
  )
  list(
    update_dataset = mutate(dd, fitted_value = fitted(gamma_mod)),
    model = gamma_mod
  )
}
# Plain linear fit; serves as the baseline for detecting a linear
# threshold-variance relationship.
#
# dd: a data frame with `threshold` and `pooled_variance` columns.
# Returns list(update_dataset = dd + fitted_value, model = lm object).
lm_fit <- function(dd) {
  lin_mod <- lm(pooled_variance ~ threshold, data = dd)
  list(
    update_dataset = mutate(dd, fitted_value = fitted(lin_mod)),
    model = lin_mod
  )
}
# Scatter plot of observed pooled_variance vs threshold with the fitted
# curve overlaid in red, one free-scaled panel per endpoint.
#
# dd: a data frame with `threshold`, `pooled_variance`, `fitted_value`,
#     and `endpoint` columns.
# Returns the ggplot object.
plot_fit_value <- function(dd) {
  ggplot(dd, aes(x = threshold, y = pooled_variance)) +
    geom_point(size = 2, alpha = 0.7) +
    geom_line(aes(y = fitted_value), color = "red") +
    facet_wrap(~ endpoint, scales = "free")
}
# Pearson correlation between observed and fitted values.
# NOTE(review): this returns r, not r^2, although the downstream columns
# in add_threshold_flag() are named *_r2 — confirm which is intended.
cal_fitted_cor <- function(dd) {
  with(dd, cor(pooled_variance, fitted_value))
}
# Attach a usability flag for the threshold approach to each endpoint.
#
# threshold_data: long table with `endpoint` and `thresDist` columns.
# exp_fit / linear_fit: named lists (by endpoint) as returned by the
#   fitting helpers; element [[1]] of each entry is the dataset with
#   a `fitted_value` column.
#
# The observed-vs-fitted correlation is computed for both fits and the
# flag is assigned as:
#   both >= 0.95 -> "warning"     (linear-looking relationship)
#   both <  0.95 -> "do not use"  (polynomial-looking relationship)
#   otherwise    -> NA            (exponential; threshold approach OK)
# Only rows with thresDist == 1 are kept in the result.
add_threshold_flag <- function(threshold_data, exp_fit, linear_fit) {
  exp_cors <- map_dbl(exp_fit, ~ cal_fitted_cor(.x[[1]]))
  linear_cors <- map_dbl(linear_fit, ~ cal_fitted_cor(.x[[1]]))

  flags <- tibble(exp_r2 = exp_cors, endpoint = names(exp_fit)) %>%
    left_join(
      tibble(linear_r2 = linear_cors, endpoint = names(linear_fit)),
      by = "endpoint"
    ) %>%
    mutate(
      threshold_flag = case_when(
        exp_r2 >= 0.95 & linear_r2 >= 0.95 ~ "warning",
        exp_r2 < 0.95 & linear_r2 < 0.95 ~ "do not use",
        TRUE ~ as.character(NA)
      )
    )

  threshold_data %>%
    filter(thresDist == 1) %>%
    left_join(flags, by = "endpoint")
}
# Exponential nls fit (fixed start) for each selected endpoint.
nlsd <- sigcurved %>%
  split(.$endpoint) %>%
  map(nls_fit)
plot_fit_value(map_df(nlsd, ~ .x[[1]]))
The fits look good, yet it would be better if the second one fit more closely.
# Refit using log-linear-model starting values.
nlsd <- sigcurved %>%
  split(.$endpoint) %>%
  map(nls_fit_sp)
plot_fit_value(map_df(nlsd, ~ .x[[1]]))
# Self-starting asymptotic (SSasymp) fits.
nlsdS <- sigcurved %>%
  split(.$endpoint) %>%
  map(nls_fit_SS)
plot_fit_value(map_df(nlsdS, ~ .x[[1]]))
# Gamma GLM (inverse link) fits.
glmd <- sigcurved %>%
  split(.$endpoint) %>%
  map(glm_gamma_fit)
plot_fit_value(map_df(glmd, ~ .x[[1]]))
The fits do not look good.
# Log-scale linear fits, back-transformed for plotting.
lm_logd <- sigcurved %>%
  split(.$endpoint) %>%
  map(lm_log_fit)
plot_fit_value(map_df(lm_logd, ~ .x[[1]]))
The fits do not look good.
# Baseline linear fits.
lmd <- sigcurved %>%
  split(.$endpoint) %>%
  map(lm_fit)
plot_fit_value(map_df(lmd, ~ .x[[1]]))
The fits do not look good.
# Observed-vs-fitted correlations per endpoint: exponential (SSasymp)
# fits first, then the linear baseline.
map_dbl(nlsdS, ~ cal_fitted_cor(.x[[1]]))
map_dbl(lmd, ~ cal_fitted_cor(.x[[1]]))
The linear curve shows a high R2 in both the nls exponential fit and the lm linear fit, the exponential curve shows a high R2 only in the nls exponential fit, and the polynomial curve shows a poor R2 in both fits.
# Flag the selected endpoints.
add_threshold_flag(
  threshold_data = sigcurved,
  exp_fit = nlsd,
  linear_fit = lmd
)
The 120_a_L1_distmoved endpoint got a 'warning' flag and the percent_mortality_96 endpoint got a 'do not use' flag.
# Repeat the exponential (SSasymp) fits on the full set of endpoints.
nlsd_all <- allcurved %>%
  split(.$endpoint) %>%
  map(nls_fit_SS)
plot_fit_value(map_df(nlsd_all, ~ .x[[1]]))
# Repeat the linear baseline fits on the full set of endpoints.
lmd_all <- allcurved %>%
  split(.$endpoint) %>%
  map(lm_fit)
plot_fit_value(map_df(lmd_all, ~ .x[[1]]))
# Flag every endpoint.
add_threshold_flag(
  threshold_data = allcurved,
  exp_fit = nlsd_all,
  linear_fit = lmd_all
)
The 120_a_L1_distmoved endpoint got a 'warning' flag, and the percent_mortality_96 and percent_mortality_48 endpoints got a 'do not use' flag.
Based on the current dataset, the flagging function behaves as intended: endpoints with a linear threshold-variance relationship receive a 'warning' flag, and endpoints with a polynomial relationship receive a 'do not use' flag.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.