In diazrenata/BBSstories: What the Package Does (One Line, Title Case)

knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(dplyr)

simulate_data <- function(ntimesteps, intercept, slope, sd_error) {

  vals <- intercept + ((1:ntimesteps) * slope) + rnorm(ntimesteps, sd = sd_error)

  return(data.frame(
    time = 1:ntimesteps,
    value = vals,
    true_slope = slope,
    true_error = sd_error,
    true_intercept = intercept
  ))

}

Non trending and low error

flat_low <- replicate(n = 10, expr = simulate_data(25, runif(1, 600, 900), runif(1, -.5, .5), sd_error = 10), simplify = F)

flat_low <- bind_rows(flat_low, .id = "rep") %>%
  mutate(type = "flat_low")

ggplot(flat_low, aes(time, value, group = rep)) +
  geom_line() + theme_bw()

Trending and low error

trend_low <- replicate(n = 10, expr = simulate_data(25, runif(1, 600, 900), sample(c(-1, 1), size = 1) * runif(1, 10, 20), sd_error = 10), simplify = F)

trend_low <- bind_rows(trend_low, .id = "rep") %>%
  mutate(type = "trend_low")

ggplot(trend_low, aes(time, value, group = rep)) +
  geom_line() + theme_bw()

Non trending and high error

flat_high <- replicate(n = 10, expr = simulate_data(25, runif(1, 600, 900), runif(1, -.5, .5), sd_error = 50), simplify = F)

flat_high <- bind_rows(flat_high, .id = "rep") %>%
  mutate(type = "flat_high")

ggplot(flat_high, aes(time, value, group = rep)) +
  geom_line() + theme_bw()

Trending and high error

trend_high <- replicate(n = 10, expr = simulate_data(25, runif(1, 600, 900), sample(c(-1, 1), size = 1) * runif(1, 10, 20), sd_error = 50), simplify = F)

trend_high <- bind_rows(trend_high, .id = "rep") %>%
  mutate(type = "trend_high")

ggplot(trend_high, aes(time, value, group = rep)) +
  geom_line() + theme_bw()

All

all_sims <- bind_rows(flat_low, flat_high, trend_low, trend_high)

all_sims <- mutate(all_sims, rep_trend = paste0(rep, type)) 


ggplot(all_sims, aes(time, value, group = rep_trend, color = type)) +
  geom_line() + theme_bw()

lm_fuzz <- function(a_vector) {

  this_ts <- data.frame(time = 1:length(a_vector), value = a_vector)

  this_lm <- lm(value ~ time, this_ts) 

  this_slope <- coefficients(this_lm)[["time"]]

  this_p <- anova(this_lm)[1,5]

  this_r2 <- summary(this_lm)$r.squared

  this_resid <- resid(this_lm)

  this_est <- predict(this_lm)

  mean_est <- mean(this_est)

  resid_est <- abs(this_resid) / this_est

  mean_resid_Est <- mean(resid_est)


  return(data.frame(
    slope = this_slope,
    p = this_p,
    r2 = this_r2,
    mean_est = mean_est,
    mean_resid_est = mean_resid_Est,
    cv = sd(a_vector) / mean(a_vector)
  ))

}


lm_summaries <- list()

for(i in 1:length(unique(all_sims$rep_trend))) {

  this_df <- filter(all_sims, rep_trend == unique(all_sims$rep_trend)[i])

  lm_summaries[[i]] <- lm_fuzz(this_df$value)

  lm_summaries[[i]]$rep_trend = this_df$rep_trend[1]
  lm_summaries[[i]]$type = this_df$type[1]

}

lm_summaries <- bind_rows(lm_summaries)


ggplot(lm_summaries, aes(x = abs(slope) / mean_est, y = mean_resid_est, color = type, shape = p < .05)) +
  geom_point() + theme_bw()
ggplot(lm_summaries, aes(abs(slope), r2, color = type, shape = p < 0.05)) + geom_point() + theme_bw()

ggplot(lm_summaries, aes(cv, mean_resid_est, color = type)) +
  geom_point()

ggplot(lm_summaries, aes(cv, abs(slope), color = type)) +
  geom_point()

lm_summaries <- left_join(lm_summaries, select(all_sims,  true_slope, true_error, true_intercept, rep_trend))

ggplot(lm_summaries, aes(x = slope, y = true_slope, color = type)) +
  geom_point()

Gleanings from these plots:

residuals_est/slope (first one): This successfully breaks out into the different types of simulation I ran. Note that the p-values (shapes) don't distinguish well between flat things with low or high variability.
slope vs r2: this really shows it. The flat ones have bad r2 no matter what, and the trendy ones have high r2. This is because r2 is scaled to the range of variation in data - it's how much you can soak up.
cv vs mean resid: CV becomes difficult to use when there are trends, because the change over time results in high variability. You could maybe do cv once you remove the trend, but that's....not super unlike what I've done.

A really important dimension to this is the scale of the variability. In this case I have twiddled the parameters in the simulations to be (sort of) realistic for abundance trajectories. But we would like to be able to categorize time series in different currencies using this same set of metrics.

diazrenata/BBSstories documentation built on July 6, 2020, 11:59 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Tweet to @rdrrHQ

GitHub issue tracker

ian@mutexlabs.com