Introduction

The threshold approach aims to find the optimal response threshold at the inflection point (or "elbow") of the data (x = threshold; y = pooled variance), which presumes an exponential relationship between threshold and pooled variance.

However, there are cases in which no such inflection point exists:

  1. a linear relationship between threshold and pooled variance (too much noise in the dataset)
  2. a polynomial relationship between threshold and pooled variance (too little noise in the dataset)
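The difference is easiest to see on simulated curves. Below is a minimal sketch with made-up parameters (not real data) of the three shapes; the column names mirror the ones used later in this document.

# Made-up parameters purely for illustration; not derived from the package data.
library(ggplot2)

thr <- seq(5, 95, by = 5)
demo <- rbind(
  data.frame(type = "exponential", threshold = thr, pooled_variance = exp(4 - 0.08 * thr)),
  data.frame(type = "linear",      threshold = thr, pooled_variance = 50 - 0.4 * thr),
  data.frame(type = "polynomial",  threshold = thr, pooled_variance = 60 - 0.006 * (thr - 5)^2)
)

ggplot(demo, aes(threshold, pooled_variance)) +
  geom_point() +
  facet_wrap(~ type, scales = "free_y")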

Goal

To provide a function that can inform or warn the user when the threshold approach cannot be used.

Steps

  1. Investigate approaches for curve fitting
  2. Test the approaches on three signature endpoints representing the three curve types (exponential, linear, polynomial)
  3. Identify the best approach for the exponential curve fit
  4. Apply that approach to all endpoints

load required R packages

library(here)
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
library(stringr)
library(readr)
library(Rcurvep)

get all endpoints and the signature endpoints

allcurved (all endpoints)

sigcurved (signature endpoints)

percentd <- read_tsv(
  here("data-raw", "bmr_exclude", "percent_thres_all.txt"), col_types = cols())
distmd <- read_tsv(
  here("data-raw", "bmr_exclude", "distmoved_thres_all.txt"), col_types = cols())
simid <- read_tsv(
  here("data-raw", "bmr_exclude", "simi_thres_all.txt"), col_types = cols())

allcurved <- percentd %>% bind_rows(distmd) %>% bind_rows(simid)
sigcurved <- allcurved %>%
  filter(
    endpoint %in% c("percent_affected_96", "percent_mortality_96", "120_a_L1_distmoved")
  )
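
The fitting and flagging code below assumes the files share a common layout with (at least) the columns endpoint, threshold, pooled_variance, and thresDist; a quick check:

# Columns assumed by the fitting/flagging code below:
# endpoint, threshold, pooled_variance, thresDist
glimpse(sigcurved)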

fitting functions

nls + exponential

#https://stats.stackexchange.com/questions/11947/fitting-an-exponential-model-to-data

nls_fit <- function(dd) {
  mod <- nls(pooled_variance ~ exp(a + b * threshold), 
             data = dd, start = list(a = 0, b = 0))
  result <- dd %>%
    mutate(
      fitted_value = fitted(mod)
    )
  return(list(update_dataset = result, model = mod))
}
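
A minimal usage sketch on a single signature endpoint (assuming the objects loaded above); the fitted model and the augmented dataset come back in a two-element list.

one_nls <- nls_fit(sigcurved %>% filter(endpoint == "percent_affected_96"))
summary(one_nls$model)
head(one_nls$update_dataset)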

nls fit with starting values from a log-linear fit

nls_fit_sp <- function(dd) {

  mod_lm <- lm(log(pooled_variance) ~ threshold , data = dd)
  mod <- nls(pooled_variance ~ exp(a + b * threshold), 
             data = dd, start = list(a = coef(mod_lm)[1], b = coef(mod_lm)[2]))
  result <- dd %>%
    mutate(
      fitted_value = fitted(mod)
    )
  return(list(update_dataset = result, model = mod))
}
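
The starting values come from a log-linear fit: log(pooled_variance) = a + b * threshold is just the log of the nls model, so the lm intercept and slope map directly onto a and b. A quick sketch of that step (same data assumptions as above):

# Intercept and slope on the log scale double as starting values for
# pooled_variance ~ exp(a + b * threshold).
dd_sp <- sigcurved %>% filter(endpoint == "percent_affected_96")
coef(lm(log(pooled_variance) ~ threshold, data = dd_sp))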

nls fit with a self-starting function (SSasymp)

http://douglas-watson.github.io/post/2018-09_exponential_curve_fitting/

nls_fit_SS <- function(dd) {
  mod <- nls(pooled_variance ~ SSasymp(threshold, yf, y0, log_alpha), data = dd)
  result <- dd %>%
    mutate(
      fitted_value = fitted(mod)
    )
  return(list(update_dataset = result, model = mod))
}
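
For reference, SSasymp is the self-starting asymptotic model from the stats package, so no explicit start list is needed; it fits pooled_variance = yf + (y0 - yf) * exp(-exp(log_alpha) * threshold). A minimal sketch (same data assumptions as above):

one_ss <- nls_fit_SS(sigcurved %>% filter(endpoint == "percent_affected_96"))
coef(one_ss$model)  # yf (asymptote), y0 (value at threshold = 0), log_alpha (log rate)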

lm + log(y)

# Joanne's suggestions

lm_log_fit <- function(dd) {
  mod <- lm(log(pooled_variance) ~ threshold , data = dd)
  result <- dd %>%
    mutate(
      fitted_value = exp(fitted(mod))
    )
  return(list(update_dataset = result, model = mod))
}

glm + gamma

#https://stats.stackexchange.com/questions/240455/fitting-exponential-regression-model-by-mle

glm_gamma_fit <- function(dd) {
  mod <- glm(pooled_variance ~ threshold , data = dd, family = Gamma(link = "inverse"))
  result <- dd %>%
    mutate(
      fitted_value = fitted(mod)
    )
  return(list(update_dataset = result, model = mod))
}

lm (simple linear regression)

lm_fit <- function(dd) {
  mod <- lm(pooled_variance ~ threshold , data = dd)
  result <- dd %>%
    mutate(
      fitted_value = fitted(mod)
    )
  return(list(update_dataset = result, model = mod))
}

plot data and fitting results

plot_fit_value <- function(dd) {
  p <- ggplot(dd, aes(x = threshold, y = pooled_variance))
  p <- p + geom_point(size = 2, alpha = 0.7) + 
    geom_line(aes(y = fitted_value), color = "red") + 
    facet_wrap( ~ endpoint, scales = "free")
  return(p)
}

calculate R2 between original values and fitted values

cal_fitted_cor <- function(dd) {
  result <- cor(dd$pooled_variance, dd$fitted_value)
  return(result)
}
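
Note that cor() returns the Pearson correlation (r), which the comparisons below treat as the R2-style criterion. A minimal usage sketch on one endpoint (same data assumptions as above):

one_fit <- nls_fit(sigcurved %>% filter(endpoint == "percent_affected_96"))
cal_fitted_cor(one_fit$update_dataset)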

compare R2 between nls + exponential and linear fit & add flag column

add_threshold_flag <- function(threshold_data, exp_fit, linear_fit) {

  exp_out <- map_dbl(exp_fit, ~ cal_fitted_cor(.x[[1]])) %>% 
    tibble(exp_r2 = ., endpoint = names(exp_fit))
  linear_out <- map_dbl(linear_fit, ~ cal_fitted_cor(.x[[1]])) %>% 
    tibble(linear_r2 = ., endpoint = names(linear_fit))

  exp_out <- exp_out %>%
    left_join(
      linear_out, by = "endpoint"
    ) %>%
    mutate(
      threshold_flag = case_when(
        exp_r2 >= 0.95 & linear_r2 >= 0.95 ~ "warning",
        exp_r2 < 0.95 & linear_r2 < 0.95 ~ "do not use",
        TRUE ~ as.character(NA)
      )
    )

  result <- threshold_data %>%
    filter(thresDist == 1) %>%
    left_join(exp_out, by = "endpoint")

  return(result)

}
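
A toy illustration of the flag rule with hypothetical R2 values (the endpoint names and numbers are made up): when both correlations are high, even a straight line fits and the elbow is suspect ('warning'); when both are low, neither model describes the curve ('do not use'); a high exponential R2 alone gets no flag.

tibble(
  endpoint  = c("toy_exponential", "toy_linear", "toy_polynomial"),
  exp_r2    = c(0.99, 0.99, 0.80),
  linear_r2 = c(0.60, 0.97, 0.70)
) %>%
  mutate(
    threshold_flag = case_when(
      exp_r2 >= 0.95 & linear_r2 >= 0.95 ~ "warning",
      exp_r2 < 0.95 & linear_r2 < 0.95 ~ "do not use",
      TRUE ~ as.character(NA)
    )
  )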

Tests on signature endpoints

nls_fit

nlsd <- sigcurved %>% 
  split(.$endpoint) %>%
  map(., nls_fit)

plot_fit_value(map_df(nlsd, ~ .x[[1]])) 

The fits look good, although the fit for the second endpoint could be closer.

nls fit + sp

nlsd <- sigcurved %>% 
  split(.$endpoint) %>%
  map(., nls_fit_sp)

plot_fit_value(map_df(nlsd, ~ .x[[1]]))

nls fit + SS

nlsdS <- sigcurved %>% 
  split(.$endpoint) %>%
  map(., nls_fit_SS)

plot_fit_value(map_df(nlsdS, ~ .x[[1]]))

glm_gamma_fit

glmd <- sigcurved %>% 
  split(.$endpoint) %>%
  map(., glm_gamma_fit)

plot_fit_value(map_df(glmd, ~ .x[[1]]))

The fits do not look good.

lm_log_fit

lm_logd <- sigcurved %>% 
  split(.$endpoint) %>%
  map(., lm_log_fit)

plot_fit_value(map_df(lm_logd, ~ .x[[1]]))

The fits do not look good.

lm_fit

lmd <- sigcurved %>% 
  split(.$endpoint) %>%
  map(., lm_fit)

plot_fit_value(map_df(lmd, ~ .x[[1]]))

The fits do not look good.

Identification of criteria to single out the exponential relationship vs the others

map_dbl(nlsdS, ~ cal_fitted_cor(.x[[1]]))
map_dbl(lmd, ~ cal_fitted_cor(.x[[1]]))

The linear curve shows a high R2 for both the nls + exponential fit and the lm (linear) fit, whereas the exponential curve shows a high R2 only for the nls + exponential fit, and the polynomial curve shows a poor R2 for both.
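
The same comparison can be laid out side by side, reusing the nlsdS and lmd objects from the tests above (split() orders both lists by endpoint name, so the rows align):

tibble(
  endpoint  = names(nlsdS),
  exp_r2    = map_dbl(nlsdS, ~ cal_fitted_cor(.x[[1]])),
  linear_r2 = map_dbl(lmd, ~ cal_fitted_cor(.x[[1]]))
)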

Apply the above logic to add a flag to the thresholds

add_threshold_flag(threshold_data =  sigcurved, exp_fit = nlsd, linear_fit = lmd)

The 120_a_L1_distmoved endpoint got a 'warning' flag and the percent_mortality_96 endpoint got a 'do not use' flag.

Apply the nls + exponential fit to all endpoints

nlsd_all <- allcurved %>% 
  split(.$endpoint) %>%
  map(., nls_fit_SS)

plot_fit_value(map_df(nlsd_all, ~ .x[[1]])) 
lmd_all <- allcurved %>% 
  split(.$endpoint) %>%
  map(., lm_fit)

plot_fit_value(map_df(lmd_all, ~ .x[[1]]))
add_threshold_flag(threshold_data =  allcurved, exp_fit = nlsd_all, linear_fit = lmd_all)

The 120_a_L1_distmoved endpoint got a 'warning' flag, and the percent_mortality_96 and percent_mortality_48 endpoints got a 'do not use' flag.

Summary

Based on the current dataset,

  1. the nls + exponential fit approach can generate reasonable fits, especially when the self-starting SSasymp function (with its asymptotic equation) is used.
  2. combining the correlations from the nls + exponential fit and the linear fit can reliably flag the endpoints for which the threshold approach should not be used.



