In karinorman/biodivTS: What the Package Does (One Line, Title Case)

knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)
library(lme4)
library(broom.mixed)
library(lmerTest)
library(emmeans)
library(purrr)

Read in data, get final dataset for models

Get data and put it in long form

#get data
load(here::here("data/meta.rda"))
load(here::here("data/bt_traitfiltered.rda"))
load(here::here("data/rarefied_metrics.rda"))
load(here::here("data/rarefyID_cell_centre.rda"))

source(here::here("R/coords2continent.R"))

meta_clean <- meta %>%
  select(study_id, realm, climate, general_treat, treatment, treat_comments, treat_date, habitat, protected_area, biome_map)


meta_clean <- bt_traitfiltered %>% 
  select(study_id, rarefyid, taxa) %>% distinct() %>%
  left_join(meta_clean) %>%
  mutate(rarefyid_join = stringr::str_extract(rarefyid, "[^_]*_[^_]*")) %>%
  left_join(rarefyID_cell_centre %>% select(-STUDY_ID), by = c("rarefyid_join" = "rarefyID")) %>%
  mutate(country = coords2continent(tibble(rarefyID_x = .$rarefyID_x, rarefyID_y = .$rarefyID_y))) %>%
  select(-c(rarefyid_join, cell_extent, rarefyID_x, rarefyID_y)) %>%
  distinct()

rm(bt_traitfiltered) 

#get metric data
metrics <- rarefied_metrics %>%
  rename_with(tolower, .cols = setdiff(everything(), one_of("rarefyID")))  %>%
  mutate(logvalue = log(value + 1)) %>% # create a column for log transformation that adds the absolute value integer closest to the most negative value to create all positive values
  left_join(meta_clean, c("rarefyID" = "rarefyid"))

## Now the data that actually makes it into the model ##

#get rarefyID's that don't have enough null samples
missing_null <- bind_rows(metrics %>% 
  filter(n_missing_nulls == 1, commplete_null_samps == FALSE),
  metrics %>% filter(n_missing_nulls > 100)) %>%
  pull(rarefyID) %>%
  unique()

model_data <- metrics %>% 
  filter(metric %in% c("SES_FRic", "SES_FEve", "SES_FDiv", "Jaccard_base", "Jaccard_hind",
                       "FRic", "FEve", "FDiv", "S","CWM_bodymass_value","CWM_diet_inv", 
                       "CWM_diet_scav", "CWM_diet_vfish", "CWM_diet_vunk", "CWM_forstrat_ground", 
                       "CWM_forstrat_wataroundsurf", "CWM_forstrat_watbelowsurf", "CWM_diet_fruit", 
                       "CWM_diet_planto", "CWM_diet_seed", "CWM_diet_vend", "CWM_forstrat_understory", 
                       "CWM_diet_vect", "CWM_diet_nect", "CWM_forstrat_aerial", "CWM_forstrat_canopy", 
                       "CWM_forstrat_midhigh", "CWM_age_at_maturity_max_y", "CWM_age_at_maturity_min_y", 
                       "CWM_body_size_mm", "CWM_litter_size_max_n", "CWM_litter_size_min_n", 
                       "CWM_longevity_max_y", "CWM_offspring_size_max_mm", "CWM_offspring_size_min_mm"),
         !rarefyID %in% missing_null) %>%
  group_by(metric) %>%
  mutate(study_id =  str_extract(rarefyID, "[^_]+")) %>%
  mutate(year_scaled = scale(year), 
         scale_center = attributes(year_scaled)$`scaled:center`, 
         scale = attributes(year_scaled)$`scaled:scale`) %>%
  ungroup() %>%
  #this study has many low species richness observations, so FD metrics weren't calculated, just drop it
  filter(study_id != 348) %>%
  #somehow the taxa labels didn't get updated when the timeseries for multi-taxa studies were split up, do that now
   mutate(new_taxa = str_split_fixed(rarefyID, "_", n = 3)[,3],
            taxa = case_when(
            new_taxa == "bird" ~ "Birds",
               new_taxa == "mamm" ~ "Mammals", 
               TRUE ~ as.character(taxa)
           )) %>%
  select(-new_taxa)

usethis::use_data(model_data)
usethis::use_data(meta_clean)

Write out file with all necessary metadata for the supplement.

meta %>% 
  filter(study_id %in% model_data$study_id) %>% 
  select(-c(link_id, date_study_added, sample_desc_name)) %>% 
  write_csv(here::here("paper", "study_metadata.csv"))

Fit GLMs to overall patterns and data broken down by groups

Convergence problems! See here: https://joshua-nugent.github.io/allFit/ https://rstudio-pubs-static.s3.amazonaws.com/33653_57fc7b8e5d484c909b615d8633c01d51.html https://biologyforfun.wordpress.com/2018/04/09/help-i-have-convergence-warnings/

Model checks: https://debruine.github.io/posts/normality/ performance::check_model()

Function approach to model

get_model <- function(metric_name, formula, model_id, data, filter_study_id, bobyqa = FALSE, ...){
  print(c(metric_name, model_id))
  qlmer <- quietly(lmer)
  #print(filter_study_id)

  if (missing(filter_study_id)){
    fit_data <- data %>%
      filter(metric == metric_name) %>%
      mutate(year_scaled = scale(year),
             duration_scaled = scale(duration, scale = FALSE),
             startyear_scaled = scale(startyear, scale = FALSE),
             taxa = as.factor(taxa),
             realm = as.factor(realm),
             climate = as.factor(climate))
  } else{
    print(paste("Filter study", filter_study_id))

    fit_data <- data %>%
      filter(metric == metric_name) %>%
      mutate(year_scaled = scale(year),
             duration_scaled = scale(duration, scale = FALSE),
             startyear_scaled = scale(startyear, scale = FALSE),
             taxa = as.factor(taxa),
             realm = as.factor(realm),
             climate = as.factor(climate)) %>%
      filter(study_id == filter_study_id)

    print(n_distinct(fit_data$study_id))
  }

  print(dim(fit_data))

  # check if there's only one time series
  if(n_distinct(fit_data$rarefyID) < 2){
    return(tibble())
  }

  # # check if there are enough time points to fit
  # year_count <- fit_data %>% count(rarefyID)
  # if(any(year_count$n == 1)){
  #   return(tibble())
  # }
  if(!model_id %in% c("base", "base_dur", "base_srt")){
    if(fit_data %>% pull(model_id) %>% n_distinct() >1) {

      if (isFALSE(bobyqa)){
        fit <- qlmer(formula = formula, data = fit_data, ...)
      } else{
        fit <- qlmer(formula = formula, data = fit_data, control = lmerControl(optimizer = "bobyqa"), ...)
        plain_fit <- lmer(formula = formula, data = fit_data, control = lmerControl(optimizer = "bobyqa"), ...)
      }

      slopes <- get_fixed_slopes(metric = metric_name, model = fit$result, model_id = model_id, model_data = fit_data) 

      group_slopes <- as_tibble(slopes$emtrends) %>%
        mutate(effect = "group_slope", group = model_id) %>%
        rename(std.error = SE, estimate = year_scaled.trend, term = model_id) %>%
        mutate(term = as.character(term))

      slope_constrasts <- as_tibble(slopes$contrasts) %>%
        mutate(effect = "contrast", group = model_id) %>%
        rename(std.error = SE, term = contrast, statistic = t.ratio)

      df <- tidy(fit$result) %>% 
        bind_rows(group_slopes, slope_constrasts) %>%
        mutate(metric = metric_name, 
               model = ifelse(exists("plain_fit"), list(plain_fit), list(fit$result)), model_id = model_id,
               n_ts = n_distinct(fit_data$rarefyID),
               formula = formula,
               warning = ifelse(length(fit$warnings) == 0, NA, fit$warnings))
    }else{
      return(tibble())
    }
  }else{

    if (isFALSE(bobyqa)){
      fit <- qlmer(formula = formula, data = fit_data, ...)
    } else {
      fit <- qlmer(formula = formula, data = fit_data, control = lmerControl(optimizer = "bobyqa"), ...)
      plain_fit <- lmer(formula = formula, data = fit_data, control = lmerControl(optimizer = "bobyqa"), ...)
    }


    if (missing(filter_study_id)){
      df_out <- tidy(fit$result) %>%
        mutate(metric = metric_name, 
               model = ifelse(exists("plain_fit"), list(plain_fit), list(fit$result)), model_id = model_id, 
               n_ts = n_distinct(fit_data$rarefyID),
               formula = formula,
               warning = ifelse(length(fit$warnings) == 0, NA, fit$warnings))
    }else
      df_out <- tidy(fit$result) %>%
        mutate(metric = metric_name, 
               model = ifelse(exists("plain_fit"), list(plain_fit), list(fit$result)), model_id = model_id, 
               n_ts = n_distinct(fit_data$rarefyID),
               formula = formula,
               warning = ifelse(length(fit$warnings) == 0, NA, fit$warnings), 
               study_id = filter_study_id)
  }
}

get_fixed_slopes <- function(metric, model, model_id, model_data){
  formula <- paste("pairwise", "~", model_id)
  fit_trends <- emtrends(model, specs = as.formula(formula), var="year_scaled",  mode = "satterth", lmerTest.limit = 20105, data = model_data)

  return(fit_trends)
}
#get_model(data = model_data, metric_name = "SES_FRic", "value ~ year_scaled + (year_scaled|study_id/rarefyID) + (year_scaled|climate)", model_id = "base")

Function to fit model for studies with only one time series (so a glm)

get_glm_model <- function(metric_name, formula, model_id, data, filter_study_id, ...){
  print(c(metric_name, model_id))
  qglm <- quietly(glm)
  print(filter_study_id)

  fit_data <- data %>%
    filter(metric == metric_name) %>%
    mutate(year_scaled = scale(year),
           duration_scaled = scale(duration, scale = FALSE),
           startyear_scaled = scale(startyear, scale = FALSE),
           taxa = as.factor(taxa),
           realm = as.factor(realm),
           climate = as.factor(climate)) %>%
    filter(study_id == filter_study_id)

  print(n_distinct(fit_data$study_id))

  fit <- qglm(formula = formula, data = fit_data, ...)

  df_out <- tidy(fit$result) %>%
    mutate(metric = metric_name, model = list(fit$result), model_id = model_id, 
           n_ts = n_distinct(fit_data$rarefyID),
           formula = formula,
           warning = ifelse(length(fit$warnings) == 0, NA, fit$warnings), 
           study_id = filter_study_id)

}

Create dataframe of mixed effects models to fit

#exclude amphibians
model_data <- model_data %>% filter(taxa != "Amphibians")

metric_names <- unique(model_data$metric)
raw_metrics <- c("SES_FRic", "SES_FDiv", "SES_FEve", 
                 "CWM_diet_inv", "CWM_forstrat_ground", "CWM_forstrat_watbelowsurf", "FEve")
log_metrics <- c("FRic", "S", "FDiv")
logone_metrics <- metric_names[!metric_names %in% c(raw_metrics, log_metrics)]

raw_value_mods <- tibble(metric_name = raw_metrics, 
                         base = "value ~ year_scaled + (year_scaled|study_id/rarefyID)",
                         taxa = "value ~ year_scaled * taxa + (year_scaled|study_id/rarefyID)",
                         realm = "value ~ year_scaled * realm + (year_scaled|study_id/rarefyID)",
                         climate = "value ~ year_scaled * climate + (year_scaled|study_id/rarefyID)",
                         protected_area = "value ~ year_scaled * protected_area + (year_scaled|study_id/rarefyID)") %>%
  pivot_longer(cols = c(base, taxa, realm, climate, protected_area), names_to = "model_id", values_to = "formula") %>%
  #don't fit the CWM_litter_size_min_n and climate model, not enough data
  filter(metric_name != "CWM_litter_size_min_n" | model_id != "climate")

log_mods <- tibble(metric_name = log_metrics, 
                         base = "log(value) ~ year_scaled + (year_scaled|study_id/rarefyID)",
                         taxa = "log(value) ~ year_scaled * taxa + (year_scaled|study_id/rarefyID)",
                         realm = "log(value) ~ year_scaled * realm + (year_scaled|study_id/rarefyID)",
                         climate = "log(value) ~ year_scaled * climate + (year_scaled|study_id/rarefyID)",
                   protected_area = "log(value) ~ year_scaled * protected_area + (year_scaled|study_id/rarefyID)") %>%
  pivot_longer(cols = c(base, taxa, realm, climate, protected_area), names_to = "model_id", values_to = "formula")

logone_mods <- tibble(metric_name = logone_metrics, 
                         base = "logvalue ~ year_scaled + (year_scaled|study_id/rarefyID)",
                         taxa = "logvalue ~ year_scaled * taxa + (year_scaled|study_id/rarefyID)",
                         realm = "logvalue ~ year_scaled * realm + (year_scaled|study_id/rarefyID)",
                         climate = "logvalue ~ year_scaled * climate + (year_scaled|study_id/rarefyID)",
                      protected_area = "logvalue ~ year_scaled * protected_area + (year_scaled|study_id/rarefyID)") %>%
  pivot_longer(cols = c(base, taxa, realm, climate, protected_area), names_to = "model_id", values_to = "formula")

mods <- bind_rows(raw_value_mods, log_mods, logone_mods) %>% 
  select(metric_name, formula, model_id)

Fit models

#Fit models with base settings
mod_results <- pmap_dfr(mods, get_model, data = model_data) %>%
  mutate(REML = TRUE, bobyqa = FALSE)

#get the models that had convergence warnings
bad_models <- mod_results %>%
  select(metric, model_id, warning) %>% 
  filter(!is.na(warning)) %>% 
  distinct() %>%
  left_join(mods, by = c("metric" = "metric_name", "model_id")) 

#Turn off REML for these models, and they fit! 
bad_mod_results <- pmap_dfr(bad_models %>% select(-warning), get_model, data = model_data, REML = FALSE) %>%
  mutate(REML = FALSE, bobyqa = FALSE)

#models that still won't converge
bad_models2 <- bad_mod_results %>%
  select(metric, model_id, warning) %>% 
  filter(!is.na(warning)) %>% 
  distinct() %>%
  left_join(mods, by = c("metric" = "metric_name", "model_id")) 

#Fit with a different algorithm
bad_mod_results2 <- pmap_dfr(bad_models2 %>% select(-warning), get_model, data = model_data, bobyqa = TRUE) %>%
  mutate(REML = TRUE, bobyqa = TRUE)

model_table <- bind_rows(mod_results %>% filter(is.na(warning)), 
          bad_mod_results %>% filter(is.na(warning)),
          bad_mod_results2 %>% filter(is.na(warning))) %>%
  mutate(significant = case_when(
    p.value > 0.05 | lower.CL < 0 & upper.CL > 0 ~ FALSE,
    !is.na(p.value) | !is.na(lower.CL) ~ TRUE,
    TRUE ~ NA
  )) 

metric_model_table <- model_table %>% select(-model)

usethis::use_data(metric_model_table)

Get list of models that couldn't be fit due to model limitations

#models that were fit
uniq_metric_model <- metric_model_table %>% select(metric, model_id) %>% distinct()
#models we attempted to fit
uniq_mods <- mods %>% select(metric = metric_name, model_id) %>% distinct()

#this is a list of models that weren't fit because there weren't enough timeseries OR they didn't converge
#most traits only found for one taxa with taxa as a covariate, the nectar diet realm model was data limited
missing_mods <- setdiff(uniq_mods, uniq_metric_model)

Examine Outliers

Let's identify any major outliers in slope random effects for the base models to see if they're impacting the overall trend.

First two functions to visualize and exclude outliers

test_normality <- function(model_fit, metric_name){
  intercepts <- ranef(model_fit)$study_id$`(Intercept)`
  slopes <- ranef(model_fit)$study_id$`year_scaled`

  hist(intercepts, main = paste("intercepts for", metric_name))
  hist(slopes, main = paste("slopes for", metric_name))

  bind_rows(tidy(shapiro.test(intercepts)) %>% mutate(type = "intercept"),
            tidy(shapiro.test(slopes)) %>% mutate(type = "slope")) %>%
    mutate(metric = metric_name)
}

check_outliers <- function(start_mod, metric_name, outlier_criterion, lmer_formula, REML = TRUE, bobyqa = FALSE){

  print(metric_name)

  intercepts <- ranef(start_mod)$study_id$`(Intercept)`
  slopes <- ranef(start_mod)$study_id$`year_scaled`

  rows <- which(eval(rlang::parse_expr(outlier_criterion))) # It's row 22
  print(rows)
  if(length(rows) > 1) warning(paste("There is more than one study outside the outlier cutoff for", metric_name))

  study <- as.integer(rownames(ranef(start_mod)$study_id[rows,]))

  if (isTRUE(REML)){
    if (isTRUE(bobyqa)) {
        fit <- lmer(lmer_formula, 
              data = model_data %>% filter(metric == metric_name, study_id != study), 
              control = lmerControl(optimizer = "bobyqa"))
    } else {
      fit <- lmer(lmer_formula, 
              data = model_data %>% filter(metric == metric_name, study_id != study))
    }
  } else{
    if (isTRUE(bobyqa)) {
        fit <- lmer(lmer_formula, 
              data = model_data %>% filter(metric == metric_name, study_id != study), 
              REML = FALSE,
              control = lmerControl(optimizer = "bobyqa"))
    } else {
      fit <- lmer(lmer_formula, 
              data = model_data %>% filter(metric == metric_name, study_id != study),
              REML = FALSE)
    }
  }


  tibble(metric = metric_name,
         old_estimate = tidy(start_mod) %>% filter(term == "year_scaled") %>% pull(estimate), 
         old_p = tidy(start_mod) %>% filter(term == "year_scaled") %>% pull(p.value), 
         new_estimate = tidy(fit) %>% filter(term == "year_scaled") %>% pull(estimate), 
         new_p = tidy(fit) %>% filter(term == "year_scaled") %>% pull(p.value))

}

Visualize the random effects for the base models

base_mods <- model_table %>% 
  filter(model_id == "base") %>%
  select(metric, model, formula, REML, bobyqa) %>%
  group_by(metric) %>% 
  slice_head() %>%
  ungroup()

mod_normality <- map2_dfr(base_mods$model, base_mods$metric, test_normality)

Based on the visualization, we identify some metrics that are suspect for outliers

# create dataframe for outlier criterion based on visualize of random effects
criterion <- tibble(metric_name = c("CWM_bodymass_value", "CWM_diet_inv", "CWM_diet_scav", "CWM_diet_vect", 
                                     "CWM_diet_vend", "CWM_diet_vunk", "CWM_forstrat_aerial", "CWM_forstrat_midhigh",
                                     "CWM_forstrat_understory", "Jaccard_base", "S"),
                    outlier_criterion = c("slopes > 0.4", "slopes < -20", "slopes > 0.6", "slopes > 0.4", 
                                          "slopes > 0.5", "slopes < -0.5", "slopes > 0.04", "slopes < -0.05", 
                                          "slopes < -0.4", "slopes > 0.02", "slopes < -1")
                    )

unique_mod_df <- model_table %>%
  filter(model_id == "base") %>%
  select(metric, model, model_id, formula, REML, bobyqa) %>%
  group_by(metric) %>% 
  slice_head() %>%
  ungroup()


# get info for how original models were fit
outlier_mods <- unique_mod_df %>% 
  select(start_mod = model, metric_name = metric, lmer_formula = formula, REML, bobyqa) %>%
  right_join(criterion) %>%
  select(start_mod, metric_name, outlier_criterion, lmer_formula, REML, bobyqa)

# refit without outliers and compare
outlier_comp <- pmap_dfr(outlier_mods, check_outliers)

Fit individual models

#get list of studies that only have one time series
single_ts <- model_data %>% 
  select(study_id, rarefyID, metric) %>%
  distinct() %>%
  count(study_id, metric) %>%
  filter(n == 1) %>%
  select(-n) %>%
  rename(metric_name = metric, filter_study_id = study_id)

# get dataframe of mixed effects models to fit
metric_df <- model_data %>%
  select(filter_study_id = study_id, metric_name = metric) %>% 
  distinct() %>% 
  mutate(model_id = "base",
         formula = case_when(
           metric_name %in% raw_metrics ~ "value ~ year_scaled + (year_scaled|rarefyID)",
           metric_name %in% log_metrics ~ "log(value) ~ year_scaled + (year_scaled|rarefyID)",
           TRUE ~ "logvalue ~ year_scaled + (year_scaled|rarefyID)"
           )
         )

# exclude studies/metrics with only one rarefyID
mixed_indv_model_df <- metric_df %>%
  anti_join(single_ts, by = c("metric_name", "filter_study_id"))

# fit models for individual studies 
mixed_indv_mod_results <- pmap_dfr(mixed_indv_model_df %>% 
                                     filter(!metric_name %in% c("Jaccard_base", "Jaccard_hind")), 
                                   get_model, data = model_data)

# they didn't all converge
#get the models that had convergence warnings
bad_indv_models <- mixed_indv_mod_results %>%
  select(metric, model_id, study_id, warning) %>% 
  filter(!is.na(warning)) %>% 
  distinct() %>%
  left_join(mixed_indv_model_df, 
            by = c("metric" = "metric_name", "model_id", "study_id" = "filter_study_id")) %>%
  rename( filter_study_id = study_id)

#Turn off REML for these models, and they fit! 
bad_indv_results <- pmap_dfr(bad_indv_models %>% select(-warning), get_model, data = model_data, REML = FALSE) %>%
  mutate(REML = FALSE, bobyqa = FALSE)

#models that still won't converge
bad_indv_models2 <- bad_indv_results %>%
  select(metric, model_id, warning, study_id) %>% 
  filter(!is.na(warning)) %>% 
  distinct() %>%
  left_join(mixed_indv_model_df, 
            by = c("metric" = "metric_name", "model_id", "study_id" = "filter_study_id")) %>%
  rename( filter_study_id = study_id)

#Fit with a different algorithm
bad_indv_results2 <- pmap_dfr(bad_indv_models2 %>% select(-warning), get_model, data = model_data, control = lmerControl(optimizer = "bobyqa")) %>%
  mutate(REML = TRUE, bobyqa = TRUE)

There are 13 study/metric combos that can't be fit due to data limitations (too many rarefyID's )

tofit_mods <- bad_indv_results2 %>% filter(!is.na(warning)) %>% select(metric, study_id, formula) %>% distinct()

missing_indv_studies <- pmap_df(tofit_mods %>% select(metric_name = metric, filter_study_id = study_id), function(metric_name, filter_study_id) {model_data %>%
         filter(metric_name == metric, study_id == filter_study_id) %>%
        mutate(n = n(), nrarefyID = n_distinct(rarefyID)) %>%
        select(metric, study_id, nrarefyID, n) %>%
        distinct()})

readr::write_csv(missing_indv_studies, here::here("paper/missing_indv_studies.csv"))

Fit Jaccard Mixed models to see which will fit

mixed_jacc_mod_results <- pmap_dfr(mixed_indv_model_df %>% filter(metric_name == "Jaccard_base"), 
                                   possibly(get_model, otherwise = tibble()), data = model_data)

non_conv_study <- mixed_jacc_mod_results %>% 
  filter(!is.na(warning)) %>% pull(study_id) %>% unique()

mixed_jacc_mod_results2 <- pmap_dfr(mixed_indv_model_df %>% filter(metric_name == "Jaccard_base", filter_study_id %in% non_conv_study), 
                                    possibly(get_model, otherwise = tibble()), data = model_data, bobyqa = TRUE)

mixed_jacc_mod_results_fin <- bind_rows(mixed_jacc_mod_results %>% filter(!study_id %in% non_conv_study),
                                    mixed_jacc_mod_results2)

Fit models for studies with only one time series

#get dataframe of glm's to fit
glm_indv_model_df <- single_ts %>% 
  mutate(model_id = "base",
         formula = case_when(
           metric_name %in% raw_metrics ~ "value ~ year_scaled",
           metric_name %in% log_metrics ~ "log(value) ~ year_scaled",
           TRUE ~ "logvalue ~ year_scaled"
           )
         )

#fit them
glm_indv_mod_results <- pmap_dfr(glm_indv_model_df, get_glm_model, data = model_data)

indv_mod_results <- bind_rows(mixed_indv_mod_results %>% filter(is.na(warning)), 
          bad_indv_results %>% filter(is.na(warning)),
          bad_indv_results2 %>% filter(is.na(warning)),
          glm_indv_mod_results,
          mixed_jacc_mod_results_fin
          )

indv_mod_table <- indv_mod_results %>% select(-model)
usethis::use_data(indv_mod_table)

Sensitivity analysis for duration and trait coverage

Min time series duration

dur_dfs <- list(min3 = filter(model_data, duration > 2),
                min4 = filter(model_data, duration > 3),
                min5 = filter(model_data, duration > 4),
                min10 = filter(model_data, duration > 9))

duration_sens <- map_dfr(dur_dfs, ~pmap_dfr(filter(mods, model_id == "base"), get_model, data = .x), .id = "dataID")

# first group of bad models, try with REML off
bad_dur_models <- duration_sens %>%
  select(metric, model_id, warning, dataID) %>% 
  filter(!is.na(warning)) %>% 
  distinct() 

min3_bad <- bad_dur_models %>% filter(dataID == "min3") %>% pull(metric)
min4_bad <- bad_dur_models %>% filter(dataID == "min4") %>% pull(metric)
min5_bad <- bad_dur_models %>% filter(dataID == "min5") %>% pull(metric)
min10_bad <- bad_dur_models %>% filter(dataID == "min10") %>% pull(metric)

refit_dur <- bind_rows(pmap_dfr(filter(mods, model_id == "base", metric_name %in% min3_bad), get_model, data = dur_dfs$min3, REML = FALSE) %>% mutate(dataID = "min3"),
                       pmap_dfr(filter(mods, model_id == "base", metric_name %in% min4_bad), get_model, data = dur_dfs$min4, REML = FALSE) %>% mutate(dataID = "min4"),
                       pmap_dfr(filter(mods, model_id == "base", metric_name %in% min5_bad), get_model, data = dur_dfs$min5, REML = FALSE) %>% mutate(dataID = "min5"),
                       pmap_dfr(filter(mods, model_id == "base", metric_name %in% min10_bad), get_model, data = dur_dfs$min10, REML = FALSE) %>% mutate(dataID = "min10")
)

# two left that didn't converge
bad_dur_models2 <- refit_dur %>%
  select(metric, model_id, warning, dataID) %>% 
  filter(!is.na(warning)) %>% 
  distinct() 

refit_dur_2 <- bind_rows(pmap_dfr(filter(mods, model_id == "base", metric_name == "CWM_diet_planto"), get_model, data = dur_dfs$min3, bobyqa = TRUE) %>% mutate(dataID = "min3"),
                       pmap_dfr(filter(mods, model_id == "base", metric_name == "CWM_forstrat_midhigh"), get_model, data = dur_dfs$min5, bobyqa = TRUE) %>% mutate(dataID = "min5"))


duration_sens_mods <- bind_rows(duration_sens %>% filter(is.na(warning)),
                                refit_dur %>% filter(is.na(warning)),
                                refit_dur_2 %>% filter(is.na(warning)))

Trait Coverage

load(here::here("data/study_table.rda"))

model_data <- model_data %>%
  left_join(study_table %>% select(study_id, coverage) %>% mutate(study_id = as.factor(study_id)))

covg_dfs <- list(covg75 = filter(model_data, taxa != "Amphibians"),
                covg85 = filter(model_data, taxa != "Amphibians", coverage >= 0.85),
                covg90 = filter(model_data, taxa != "Amphibians", coverage >= 0.90))

coverage_sens <- map_dfr(covg_dfs, ~pmap_dfr(filter(mods, model_id == "base"), 
                                             get_model, data = .x), .id = "dataID")

# first group of bad models, try with REML off
bad_cov_models <- coverage_sens %>%
  select(metric, model_id, warning, dataID) %>% 
  filter(!is.na(warning)) %>% 
  distinct() 

covg75_bad <- bad_cov_models %>% filter(dataID == "covg75") %>% pull(metric)
covg85_bad <- bad_cov_models %>% filter(dataID == "covg85") %>% pull(metric)
covg90_bad <- bad_cov_models %>% filter(dataID == "covg90") %>% pull(metric)

refit_cov <- bind_rows(pmap_dfr(filter(mods, model_id == "base", metric_name %in% covg75_bad), 
                                get_model, data = covg_dfs$covg75, REML = FALSE) %>% mutate(dataID = "covg75"),
                       pmap_dfr(filter(mods, model_id == "base", metric_name %in% covg85_bad), 
                                get_model, data = covg_dfs$covg85, REML = FALSE) %>% mutate(dataID = "covg85"),
                       pmap_dfr(filter(mods, model_id == "base", metric_name %in% covg90_bad), 
                                get_model, data = covg_dfs$covg90, REML = FALSE) %>% mutate(dataID = "covg90")
)

coverage_sens_mods <- bind_rows(coverage_sens %>% filter(is.na(warning)),
                                refit_cov %>% filter(is.na(warning)))

sens_mods <- bind_rows(duration_sens_mods, coverage_sens_mods) %>%
  select(-model, -warning) %>%
  filter(metric != "Jaccard_next")

Perform p-value correction

# Let's get a dataframe of p-values for the models we actually used

# base models for all metrics

base <- metric_model_table %>%
  filter(model_id == "base", term == "year_scaled",
         metric != "Jaccard_hind")


# within group and between group trends for all metrics

contrast <- metric_model_table %>%
  filter(model_id != "base", effect == "contrast")

group_slope <-  metric_model_table %>%
  filter(model_id != "base", effect == "group_slope") %>%
  mutate(tval = estimate/std.error,
         p.value = 2*pt(-abs(tval), df))

# study-level trends for species and functional diversity metrics

study_trend <- indv_mod_table %>%
  filter(term == "year_scaled")

# sensitivity models
sens_trends <- sens_mods %>%
  filter(term == "year_scaled")

pval_df <- bind_rows(base, contrast, group_slope, study_trend, sens_trends) %>%
  filter(metric != "Jaccard_next")

adj.p <- p.adjust(pval_df$p.value, method = "BH")

pval_df <- bind_cols(pval_df, p.adjust = adj.p)

# # Which of the general model trends are likely spurious
# pval_df %>% 
#   filter(p.value < 0.05, p.adjust > 0.05) %>% 
#   filter(is.na(study_id)) %>% 
#   View()
# 
# # Which of the study-level trends are likely spurious
# pval_df %>% 
#   filter(p.value < 0.05, p.adjust > 0.05) %>% 
#   filter(!is.na(study_id)) %>% 
#   View()

# add adjusted p value to model table

model_output <- bind_rows(metric_model_table, indv_mod_table) %>%
  select(-p.value) %>% 
  left_join(pval_df) %>%
  select(-significant, -tval)

sensitivity_output <- sens_mods %>%
  select(-p.value) %>%
  left_join(pval_df %>% select(p.value, p.adjust, effect, term, model_id, dataID, metric), na_matches = "never") %>%
  rename(sens_level = dataID) 

usethis::use_data(model_output)
usethis::use_data(sensitivity_output)

write_csv(sensitivity_output, here::here("paper", "sensitivity_models.csv"))

Write out table of model estimates for manuscript

#manuscript table
metric_model_table %>% 
  filter(model_id == "base", metric %in% c("S", "Jaccard_base", "SES_FRic", "SES_FEve", "SES_FDiv")) %>% 
  select(-c(model_id, warning, statistic, df)) %>% 
  readr::write_csv(here::here("figures", "man_model_table.csv"))

#supplement of complete model output
model_output %>%
    select(-c(n_ts, warning, lower.CL, upper.CL, REML, bobyqa)) %>%
  readr::write_csv(here::here("paper", "model_estimates_supp.csv"))

Analizing slopes

Is slope a function of time series characteristics?

library(multcomp)

study_time_data <- model_data %>% 
  dplyr::select(study_id, startyear, duration) %>% 
  distinct() %>%
  group_by(study_id) %>%
  mutate(startyear = min(startyear), duration = max(duration)) %>%
  distinct()

study_slope_data <- model_output %>%
  filter(!is.na(study_id), term == "year_scaled") %>%
  dplyr::select(study_id, metric, estimate) %>%
  left_join(study_time_data)

duration_slope <- study_slope_data %>%
  group_nest(metric) %>%
  mutate(model = purrr::map(data, ~glm(estimate ~ duration, 
                                 data = .x)),
           coef = purrr::map(model, tidy),
           ) %>% 
  dplyr::select(metric, coef) %>%
  unnest(cols = c(coef)) %>%
  mutate(covariate = "duration")

startyear_slope <- study_slope_data %>%
  group_nest(metric) %>%
  mutate(model = purrr::map(data, ~glm(estimate ~ startyear, 
                                 data = .x)),
           coef = purrr::map(model, tidy),
           ) %>% 
  dplyr::select(metric, coef) %>%
  unnest(cols = c(coef)) %>%
  mutate(covariate = "startyear")

#All model estimates in one dataframe
slope_models <- bind_rows(duration_slope, startyear_slope)

# Which predictor variables were significant?
slope_models %>% filter(term != "(Intercept)", p.value < 0.05) %>% View()