knitr::opts_chunk$set(echo = FALSE)
library(tibble)
library(dplyr)
library(MCMCpack)
library(mclust)
library(corrplot)
library(coda)
library(runjags)
library(ggplot2)
library(patchwork)
library(msod)
varname2type <- function(varnames){
  types <- case_when(
    grepl("lv.coef", varnames) ~ "LV Load",
    grepl("LV", varnames) ~ "LV",
    grepl("^(mu|tau|sigma)", varnames) ~ "Comm Param", #parameters of community distributions
    grepl("^u.b", varnames) ~ "Occu Coef",
    grepl("^v.b", varnames) ~ "Detn Coef",
    TRUE ~ "other"
    )
  return(types)
}

Data Import

modelspecs <- readRDS("./7_3_03_modelspecs.rds")
filenames <- lapply(modelspecs, function(x) x$filename )

# test loading models
a <- vapply(filenames, file.exists, FUN.VALUE = FALSE)
stopifnot(all(a))

# load and remove crosscorrelation
fittedmods <- lapply(filenames, function(x) {
  fit <- readRDS(x)
  return(fit)})
inputdata <- readRDS(readLines("../link_7_2_10_input_data.txt")[[1]])
# the following whittle out the covariates that we are interested in the residual diagnostics
detcovar <- model.matrix(~ ModelSiteID + MeanWind +  MeanTime + MeanClouds + MeanTemp + ObserverId - 1,
             data = inputdata$insampledata$yXobs) %>%
  as_tibble() %>% rename(ModelSite = ModelSiteID)
occcovar <- model.matrix(as.formula(modelspecs[[1]]$OccFmla),
             data = inputdata$insampledata$Xocc) %>%
  as_tibble() %>%
  mutate(ModelSite = inputdata$insampledata$Xocc$ModelSiteID,
         StudyId = inputdata$insampledata$Xocc$StudyId,
         latitude = inputdata$insampledata$Xocc$latitude,
         SurveyYear = inputdata$insampledata$Xocc$SurveyYear) %>%
  dplyr::select(-`(Intercept)`)
treatfactor_occ <- occcovar %>%
  summarise_all(~ n_distinct(.)) %>%
  tidyr::pivot_longer(everything(), names_to = "CovarName") %>%
  dplyr::filter(value <= 10) %>%
  dplyr::select(CovarName) %>%
  unlist()
treatfactor_det <- detcovar %>%
  summarise_all(~ n_distinct(.)) %>%
  tidyr::pivot_longer(everything(), names_to = "CovarName") %>%
  dplyr::filter(value <= 10) %>%
  dplyr::select(CovarName) %>%
  unlist()
treatfactor = c(treatfactor_occ, treatfactor_det)
lpds_l <- readRDS("./fittedmodels/7_3_03_lpds.rds")
waics_l <- readRDS("./fittedmodels/7_3_03_waics.rds")
Enums_holdout <- readRDS("./fittedmodels/7_3_03_many_Enum_holdout.rds")
Enums_insample <- readRDS("./fittedmodels/7_3_03_many_Enum_insample_margLV.rds")
Enums_insample_condLV <- readRDS("./fittedmodels/7_3_03_many_Enum_insample_condLV.rds")

names(lpds_l) <- gsub("someclimate_year_woody500m_msnm_det1stO", "model", names(lpds_l))
names(waics_l) <- gsub("someclimate_year_woody500m_msnm_det1stO", "model", names(waics_l))
names(Enums_holdout) <- gsub("someclimate_year_woody500m_msnm_det1stO", "model", names(Enums_holdout))
names(Enums_insample) <- gsub("someclimate_year_woody500m_msnm_det1stO", "model", names(Enums_insample))
names(Enums_insample_condLV) <- gsub("someclimate_year_woody500m_msnm_det1stO", "model", names(Enums_insample_condLV))
cleanersummary <- function(fit) {
  vals <- fit$summaries %>%
    as_tibble(rownames = "bugsvarname") %>%
    rename_with(function(x) "AC_10", starts_with("AC")) %>%
    mutate(varname = gsub("\\[.*\\]", "", bugsvarname)) %>%
    mutate(type = varname2type(bugsvarname))
  suppressWarnings(vals <- vals %>%
    mutate(SpeciesIdx = as.integer(gsub("(.*\\[|,.*\\])", "", bugsvarname))) %>%
    mutate(CovarIdx = as.integer(gsub("(.*\\[.*,|\\])", "", bugsvarname))) )
  vals <- vals %>%
    mutate(Species = case_when(
      varname == "LV" ~ as.character(NA),
      TRUE ~ fit$species[SpeciesIdx])) %>%
    mutate(ModelSite = case_when(
      varname == "LV" ~ SpeciesIdx,
      TRUE ~ NA_integer_)) %>%
    mutate(Covariate = case_when(
      varname == "u.b" ~ colnames(fit$data$Xocc)[CovarIdx],
      varname == "v.b" ~ colnames(fit$data$Xobs)[CovarIdx],
      varname == "LV" ~ paste0("LV", as.character(CovarIdx)),
      varname == "lv.coef" ~ paste0("LV", as.character(CovarIdx)),
      TRUE ~ as.character(NA)
    )) %>% 
    dplyr::select(-SpeciesIdx, -CovarIdx)
  return(vals)
}
cleanersummary_l <- lapply(fittedmods, cleanersummary)
cleanersummaries <- bind_rows(cleanersummary_l, .id = "Model")
cat("MCMC time:\n")
lapply(fittedmods, function(x) {
  if (!is.null(x$timetaken)) {return(runjags::timestring(as.numeric(x$timetaken, units="secs")))}
  else return(NULL)})

Comment on time taken for the MCMC of each model:

MCMC Assessment

Multi Chain Gelman-Rubin Statistic (aka "Rhat" or psrf)

Values less than 1.1 are desired.

cleanersummaries %>%
  mutate(type = varname2type(varname)) %>%
  ggplot() +
  facet_grid(rows = vars(type), cols = vars(Model), scales = "free") +
  geom_point(aes(y = bugsvarname, x = psrf) ) +
  geom_vline(xintercept = 1.1, col = "blue", lty = "dashed") +
  scale_y_discrete(name = "Variable Name") +
  theme(strip.text.x.top = element_text(angle = 90)) +
  scale_x_continuous(name = "Gelman-Rubin Statistic")

Comment on convergence using the Gelman-Rubin statistic

Effective Sample Size

Effective sample size should be similar order of magnitude to the number of MCMC samples.

goodthresh <- fittedmods[[1]]$sample * length(fittedmods[[1]]$mcmc) / 4
plt1 <- cleanersummaries %>%
  mutate(type = varname2type(varname)) %>%
  ggplot() +
  facet_grid(rows = vars(type), cols = vars(Model), scales = "free") +
  geom_point(aes(y = bugsvarname, x = SSeff, col = SSeff < goodthresh) ) +
  geom_vline(xintercept = goodthresh, col = "red", lty = "dashed") +
  scale_y_discrete(name = "Variable Name") +
  theme(strip.text.x.top = element_text(angle = 90)) +
  scale_x_continuous(name = "Effective Sample Size", trans = "log10")

plt2 <- cleanersummaries %>%
  mutate(type = varname2type(varname)) %>%
  ggplot() +
  facet_grid(rows = vars(type), cols = vars(Model), scales = "free") +
  geom_histogram(aes(x = SSeff), bins = 50 ) +
  geom_vline(xintercept = goodthresh, col = "red", lty = "dashed") +
  scale_y_discrete(name = "Variable Name") +
  theme(strip.text.x.top = element_blank()) +
  scale_x_continuous(name = "Effective Sample Size", trans = "log10")

plt1 / plt2

Geweke (Convergence)

See http://www.ugrad.stat.ubc.ca/R/library/coda/html/geweke.diag.html for a very quick description of this. The Geweke values are Z-scores (technically t-distributed in this situation?). 95% of (independent) Geweke values should be within 2 standard deviations (i.e. just 2 for Z-scores) of the mean.

gwk <- bind_rows(lapply(fittedmods, 
       function(x) enframe(geweke.diag(x, frac1=0.1, frac2=0.5)$z, name = "varname")),
       .id = "Model")


plt1 <- gwk %>%
  mutate(type = varname2type(varname)) %>%
  ggplot() +
  facet_grid(rows = vars(type), cols = vars(Model), scales = "free") +
  geom_point(aes(y = varname, x = value, col = abs(value) < 2) ) +
  geom_vline(xintercept = c(-2, 2), col = "blue", lty = "dashed") +
  scale_y_discrete(name = "Variable Name") +
  theme(strip.text.x.top = element_text(angle = 90)) +
  scale_x_continuous(name = NULL)

plt2 <- gwk %>%
  mutate(type = varname2type(varname)) %>%
  ggplot() +
  facet_grid(rows = vars(type), cols = vars(Model), scales = "free") +
  geom_histogram(aes(x = value), bins = 30 ) +
  geom_vline(xintercept = c(-2, 2), col = "blue", lty = "dashed") +
  scale_y_continuous(name = "Variable Name") +
  theme(strip.text.x.top = element_blank()) +
  scale_x_continuous(name = "Geweke Diagnostic Statistics")

plt1 / plt2

Conclusions

Summarise behaviour of the MCMC chains

Compare Covariate Loadings

cleanersummaries %>%
  dplyr::filter(varname == "u.b") %>%
  dplyr::mutate(ci95_misses_0 = !((Lower95 < 0) & (Upper95 > 0))) %>%
  ggplot() +
  facet_wrap(vars(Covariate), nrow = 1) +
  geom_hline(yintercept = 0, col = "blue", lty = "dashed") +
  geom_pointrange(aes(x = Species,
                 ymin = Lower95,
                 ymax = Upper95,
                 y = Median,
                 col = Model,
                 alpha = ci95_misses_0,
                 group = Model),
             shape = "|",
             size = 1,
             position = position_dodge(width = 1)) +
  scale_y_continuous(name = "Posterior Estimate") +
  coord_flip() +
  scale_color_viridis_d() +
  theme(strip.text.x.top = element_text(angle = 90)) +
  ggtitle("Occupancy Covariate Loadings per Species and Model")
cleanersummaries %>%
  dplyr::filter(varname == "v.b") %>%
  dplyr::mutate(ci95_misses_0 = !((Lower95 < 0) & (Upper95 > 0))) %>%
  ggplot() +
  facet_wrap(vars(Covariate), nrow = 1) +
  geom_hline(yintercept = 0, col = "blue", lty = "dashed") +
  geom_pointrange(aes(x = Species,
                 ymin = Lower95,
                 ymax = Upper95,
                 y = Median,
                 col = Model,
                 alpha = ci95_misses_0,
                 group = Model),
             shape = "|",
             size = 1,
             position = position_dodge(width = 1)) +
  scale_y_continuous(name = "Posterior Estimate") +
  coord_flip() +
  scale_color_viridis_d() +
  theme(strip.text.x.top = element_text(angle = 90)) +
  ggtitle("Detection Covariate Loadings per Species and Model")

LV Loadings

cleanersummaries %>%
  dplyr::filter(varname == "lv.coef") %>%
  ggplot() +
  facet_wrap(vars(Covariate), nrow = 1) +
  geom_hline(yintercept = 0) +
  geom_pointrange(aes(x = Species,
                 ymin = Lower95,
                 ymax = Upper95,
                 y = Median,
                 col = Model,
                 group = Model),
             shape = "|",
             size = 1,
             position = position_dodge(width = 1)) +
  coord_flip() +
  scale_color_viridis_d() +
  theme(strip.text.x.top = element_text(angle = 90)) +
  ggtitle("LV Loadings per Species and Model")

Log Posterior Density

LPD of Holdout Data

lpds_df <- as_tibble(do.call(cbind, lapply(fittedmods,
                                           function(x) x$quality$holdout$lpd$lpds)))
melpd <- data.frame(Estimate = apply(lpds_df, 2, mean),
                    SE = apply(lpds_df, 2, sd) / sqrt(nrow(lpds_df)))

lpds_df %>%
  as_tibble() %>%
  rowid_to_column(var = "HoldOutModelSite") %>%
  tidyr::pivot_longer(-HoldOutModelSite, names_to = "model", values_to = "Site_lpd") %>%
  ggplot() +
  # facet_grid(rows = vars(model), scales = "free_y") +
  geom_violin(aes(x = Site_lpd, y = model), adjust = 0.2) +
  stat_summary(aes(x = Site_lpd, y = model),
               fun.data=mean_se, fun.args = list(mult=2), geom="crossbar", width=0.2 ) +
  stat_summary(aes(x = Site_lpd, y = model),
               fun = median, geom = "point", shape = 23, size = 2) +
  ggtitle("Log-Posterior Density of Holdout ModelSites")

LPD of In-Sample Data

Below is the lpd computed for each ModelSite for both the InSample and out of sample data.

insamplelpds_df <- do.call(cbind, lapply(fittedmods, function(x) x$quality$insample$waic$pointwise[, "elpd_waic"])) %>%
  as_tibble() %>% mutate(InSample = TRUE)

df <- lpds_df %>% as_tibble() %>% mutate(InSample = FALSE) # %>% tidyr::pivot_longer(-InSample, names_to = "model", values_to = "site_lpd")

df <- bind_rows(insamplelpds_df, df)


df %>%
  as_tibble() %>%
  tidyr::pivot_longer(-InSample, names_to = "model", values_to = "site_lpd") %>%
  ggplot() +
  facet_grid(rows = vars(model), scales = "free_y", switch = "y") +
  geom_violin(aes(x = site_lpd, y = InSample, col = InSample), adjust = 0.2, position = position_dodge(1)) +
  stat_summary(aes(x = site_lpd, y = InSample, group = InSample),
               fun.data=mean_se, fun.args = list(mult=2), geom="crossbar", width=0.2 ) +
  theme(strip.text.y.left = element_text(angle = 0)) +
  ggtitle("Log Posterior Density of ModelSites")

WAIC, LOO-PSIS and Holdout

waics_average_elpd_per_point <- lapply(fittedmods, function(x) x$quality$insample$waic$estimates["elpd_waic", ]/nrow(x$quality$insample$waic$pointwise))
waics_average_elpd_per_point <- simplify2array(waics_average_elpd_per_point)
waics_average_elpd_per_point <- t(waics_average_elpd_per_point) %>% as_tibble(rownames = "Model")
waics_average_elpd_per_point$type = "WAIC"

loo_average_elpd_per_point <- lapply(fittedmods, function(x) x$quality$insample$loo$estimates["elpd_loo", ]/nrow(x$quality$insample$loo$pointwise))
loo_average_elpd_per_point <- simplify2array(loo_average_elpd_per_point)
loo_average_elpd_per_point <- t(loo_average_elpd_per_point) %>% as_tibble(rownames = "Model")
loo_average_elpd_per_point$type = "LOO-PSIS"

melpd$type <- "Holdout"
melpd <- as_tibble(melpd, rownames = "Model")

df <- bind_rows(waics_average_elpd_per_point, loo_average_elpd_per_point, melpd)

df %>%
  ggplot() +
  geom_errorbar(aes(x = Model, ymax = Estimate +  2*SE, ymin = Estimate - 2*SE, col = type, lty = type)) +
  geom_point(aes(x = Model, y = Estimate, col = type), position = position_jitter(width = 0.1)) +
  coord_flip() +
  ggtitle("Estimates of Log Posterior Density for a new ModelSite")

Diagnostics from LOO-PSIS

ModelSites which diagnositics suggest have a bad WAIC estimate or loo-psis estimate are investigated below:

lpds_df_diagnose <- bind_rows(lapply(fittedmods, 
                                    function(x) {
                                      waicsvals <- as_tibble(x$quality$insample$waic$pointwise[, c("elpd_waic", "p_waic")])
                                      loovals <- as_tibble(x$quality$insample$loo$pointwise[, c("elpd_loo", "p_loo")])
                                      loodiagnose <- as_tibble(x$quality$insample$loo$diagnostics)
                                      return(cbind(ModelSite = 1:nrow(waicsvals), waicsvals, loovals, loodiagnose))
                                      }),
                             .id = "model") 

lpds_df_diagnose %>%
  mutate(is_ok = case_when(
    pareto_k < 0.5 ~ "good",
    pareto_k < 0.7 ~ "ok",
    pareto_k >= 0.7 ~ "bad"
  )) %>% 
  mutate(is_ok = factor(is_ok)) %>%
  group_by(model) %>%
  count(is_ok) %>%
  tidyr::pivot_wider(names_from = is_ok,
              values_from = n)

lpds_df_diagnose %>%
  ggplot() +
  facet_grid(rows = vars(model), scales = "free_y", switch = NULL) +
  geom_point(aes(x = elpd_waic, y = p_waic, col = p_waic < 0.4)) +
  geom_hline(yintercept = 0.4, col = "blue") + #see [1]A. Vehtari, A. Gelman, and J. Gabry, "Practical Bayesian model evaluation using leave-one-out cross-validation and WAIC," Stat Comput, vol. 27, pp. 1413-1432, Sep. 2017
  # geom_rug(aes(x = elpd_waic),
  #          data = function(x) dplyr::filter(x, p_waic > 0.4),
  #          sides = "t") +
  theme(strip.text.y.right = element_text(angle = 0)) +
  scale_y_continuous(name = "p_waic", trans = "log") +
  ggtitle("Diagnostics of WAIC")

lpds_df_diagnose %>%
  mutate(is_ok = case_when(
    pareto_k < 0.5 ~ "good",
    pareto_k < 0.7 ~ "ok",
    pareto_k >= 0.7 ~ "bad"
  )) %>% 
  ggplot() +
  facet_grid(rows = vars(model), switch = NULL) +
  geom_point(aes(x = ModelSite, y = pareto_k, col = is_ok)) +
  geom_hline(yintercept = 0.5, col = "blue") + #see [1]A. Vehtari, A. Gelman, and J. Gabry, "Practical Bayesian model evaluation using leave-one-out cross-validation and WAIC," Stat Comput, vol. 27, pp. 1413-1432, Sep. 2017
  geom_hline(yintercept = 0.7, col = "blue") + #see [1]A. Vehtari, A. Gelman, and J. Gabry, "Practical Bayesian model evaluation using leave-one-out cross-validation and WAIC," Stat Comput, vol. 27, pp. 1413-1432, Sep. 2017
  theme(strip.text.y.right = element_text(angle = 0)) +
  ggtitle("Diagnostics of LOO-PSIS: Pareto_k for ModelSites")

lpds_df_diagnose %>%
  mutate(is_ok = case_when(
    pareto_k < 0.5 ~ "good",
    pareto_k < 0.7 ~ "ok",
    pareto_k >= 0.7 ~ "bad"
  )) %>% 
  dplyr::filter(is_ok == "bad") %>%
  ggplot() +
  facet_grid(rows = vars(model), switch = NULL) +
  geom_point(aes(x = p_loo, y = pareto_k)) +
  theme(strip.text.y.right = element_text(angle = 0)) +
  ggtitle("Diagnostics of LOO-PSIS: p_loo of pareto_k for ModelSites")

lpds_df_diagnose %>%
  mutate(is_ok = case_when(
    pareto_k < 0.5 ~ "good",
    pareto_k < 0.7 ~ "ok",
    pareto_k >= 0.7 ~ "bad"
  )) %>% 
  dplyr::filter(is_ok == "bad") %>%
  # inner_join(detcovar, by = "ModelSite") %>%
  inner_join(occcovar, by = "ModelSite") %>%
  dplyr::select(-elpd_waic, -p_waic, -elpd_loo, -n_eff) %>%
  tidyr::pivot_longer(c(-model, -ModelSite, -p_loo, -pareto_k, -is_ok), names_to = "Covariate", values_to = "CovariateValue") %>%
  ggplot() +
  facet_wrap(vars(Covariate), scales = "free") +
  theme(strip.text.y.right = element_text(angle = 0)) +
  geom_point(aes(x = CovariateValue, y = factor(ModelSite), col = model),
             shape = "+", position = "jitter") +
  scale_x_continuous(trans = scales::modulus_trans(0))

lpds_df_diagnose %>%
  mutate(is_ok = case_when(
    pareto_k < 0.5 ~ "good",
    pareto_k < 0.7 ~ "ok",
    pareto_k >= 0.7 ~ "bad"
  )) %>% 
  dplyr::filter(is_ok == "bad") %>%
  inner_join(detcovar, by = "ModelSite") %>%
  # inner_join(occcovar, by = "ModelSite") %>%
  dplyr::select(-elpd_waic, -p_waic, -elpd_loo, -n_eff) %>%
  tidyr::pivot_longer(c(-model, -ModelSite, -p_loo, -pareto_k, -is_ok), names_to = "Covariate", values_to = "CovariateValue") %>%
  ggplot() +
  facet_wrap(vars(Covariate), scales = "free") +
  theme(strip.text.y.right = element_text(angle = 0)) +
  geom_point(aes(x = CovariateValue, y = factor(ModelSite), col = model),
             shape = "+", position = "jitter") +
  scale_x_continuous(trans = scales::modulus_trans(0))

lpds_df_diagnose %>%
  mutate(is_ok = case_when(
    pareto_k < 0.5 ~ "good",
    pareto_k < 0.7 ~ "ok",
    pareto_k >= 0.7 ~ "bad"
  )) %>% 
  dplyr::filter(is_ok == "bad") %>%
  # inner_join(detcovar, by = "ModelSite") %>%
  inner_join(occcovar, by = "ModelSite") %>%
  dplyr::select(-elpd_waic, -p_waic, -elpd_loo, -n_eff) %>%
  tidyr::pivot_longer(c(-model, -ModelSite, -p_loo, -pareto_k, -is_ok), names_to = "Covariate", values_to = "CovariateValue") %>%
  ggplot() +
  facet_wrap(vars(Covariate), scales = "free") +
  theme(strip.text.y.right = element_text(angle = 0)) +
  geom_freqpoly(aes(x = CovariateValue, y = ..density.., lty = model, col = model), position = "stack") +
  geom_freqpoly(aes(x = CovariateValue, y = ..density..),
                data = tidyr::pivot_longer(occcovar, c(-ModelSite), names_to = "Covariate", values_to = "CovariateValue"),
                col = "black") +
  scale_x_continuous(trans = scales::modulus_trans(0))

lpds_df_diagnose %>%
  mutate(is_ok = case_when(
    pareto_k < 0.5 ~ "good",
    pareto_k < 0.7 ~ "ok",
    pareto_k >= 0.7 ~ "bad"
  )) %>% 
  dplyr::filter(is_ok == "bad") %>%
  inner_join(detcovar, by = "ModelSite") %>%
  dplyr::select(-elpd_waic, -p_waic, -elpd_loo, -n_eff) %>%
  tidyr::pivot_longer(c(-model, -ModelSite, -p_loo, -pareto_k, -is_ok), names_to = "Covariate", values_to = "CovariateValue") %>%
  ggplot() +
  facet_wrap(vars(Covariate), scales = "free") +
  theme(strip.text.y.right = element_text(angle = 0)) +
  geom_freqpoly(aes(x = CovariateValue, y = ..density.., lty = model, col = model), position = "stack") +
  geom_freqpoly(aes(x = CovariateValue, y = ..density..),
                data = tidyr::pivot_longer(detcovar, c(-ModelSite), names_to = "Covariate", values_to = "CovariateValue"),
                col = "black") +
  scale_x_continuous(trans = scales::modulus_trans(0))

According to help page 'loo-glossary', if p_loo << [total parameters in the model], then the model is likely misspecified.

Model Difference, lpd per site

elpd_compare(lpds_df)
elpd_compare(elpds = do.call(cbind, lapply(fittedmods, function(x) x$quality$insample$waic$pointwise[, 1])))
elpd_compare(elpds = do.call(cbind, lapply(fittedmods, function(x) x$quality$insample$loo$pointwise[, 1])))

plot_compare_loo <- function(compare.loo.obj){
  plt <- compare.loo.obj %>%
    as_tibble(rownames = "Model") %>%
    ggplot() +
    geom_errorbarh(aes(xmin = elpd_diff - 2 * se_diff, xmax = elpd_diff + 2 * se_diff,
                       y = Model)) +
    geom_point(aes(x = elpd_diff, y = Model)) +
    geom_vline(xintercept = 0, col = "blue") +
    scale_x_continuous("Pointwise Difference in Expected Log Posterior Density")
  return(plt)
}

elpd_compare(lpds_df) %>%
  plot_compare_loo() +
  ggtitle("Holdout elpd Differences")

elpd_compare(elpds = do.call(cbind, lapply(fittedmods, function(x) x$quality$insample$waic$pointwise[, 1]))) %>%
  plot_compare_loo() +
  ggtitle("WAICS Differences")

elpd_compare(elpds = do.call(cbind, lapply(fittedmods, function(x) x$quality$insample$waic$pointwise[, 1]))) %>%
  plot_compare_loo() +
  ggtitle("LOO-PSIS Differences")

Biodiversity Accuracy

Holdout

obsnumbers <- detectednumspec(inputdata$holdoutdata$yXobs[, inputdata$species], 
                              inputdata$holdoutdata$yXobs[, "ModelSiteID", drop = TRUE])
Enum_det <- do.call(cbind, lapply(fittedmods, function(x)
  x$quality$holdout$predspecnum["Esum_det", , drop = TRUE]))
Vnum_det <- do.call(cbind, lapply(fittedmods, function(x)
  x$quality$holdout$predspecnum["Vsum_det", , drop = TRUE]))

Enum_compare(obsnumbers, Enum_det, Vnum_det)
meanplt <- Enum_compare(obsnumbers, Enum_det, Vnum_det) %>%
  as_tibble(rownames = "Model") %>%
  tidyr::pivot_longer(starts_with("SE"), names_to = "SEtype", values_to = "SE") %>%
  ggplot() +
  geom_vline(xintercept = 0, col = "blue") +
  geom_errorbarh(aes(xmin = `E[D]_obs` - 2 * SE, xmax = `E[D]_obs` + 2 * SE,
                   y = Model,
                   col = SEtype, lty = SEtype)) +
  geom_point(aes(x = `E[D]_obs`, y = Model)) +
  scale_x_continuous("Mean Residual") +
  ggtitle("Mean Residual of Number of Detected Spccies")

varplt <- Enum_compare(obsnumbers, Enum_det, Vnum_det) %>%
  as_tibble(rownames = "Model") %>%
  tidyr::pivot_longer(starts_with("V"), names_to = "Vtype", values_to = "Variance") %>%
  ggplot() +
  geom_point(aes(x = sqrt(Variance), y = Model, col = Vtype, shape = Vtype)) +
  scale_x_continuous("Standard Deviation of Residual") +
  theme(legend.position = "bottom") +
  ggtitle("Standard Deviation of Number of Detected Species")

coverageplt <- Enum_coverage(obsnumbers, Enum_det, Vnum_det)[["mean"]] %>%
  tibble::enframe(name = "Model", value = "Pcnt. Coverage") %>%
  ggplot() +
  geom_point(aes(x = `Pcnt. Coverage`, y = Model)) +
  geom_vline(xintercept = 0.95, col = "blue") +
  theme(legend.position = "bottom") +
  ggtitle("Coverage of approx. 95% credible intervals for biodiversity")

print(meanplt / (varplt + coverageplt) + plot_annotation('Holdout Biodiversity'))

In-Sample, marginal LV

obsnumbers <- detectednumspec(inputdata$insample$yXobs[, inputdata$species], 
                              inputdata$insample$yXobs[, "ModelSiteID", drop = TRUE])
Enum_det <- do.call(cbind, lapply(fittedmods, function(x)
  x$quality$insample$predspecnum["Esum_det_margpost", , drop = TRUE]))
Vnum_det <- do.call(cbind, lapply(fittedmods, function(x)
  x$quality$insample$predspecnum["Vsum_det_margpost", , drop = TRUE]))

Enum_compare(obsnumbers, Enum_det, Vnum_det)
meanplt <- Enum_compare(obsnumbers, Enum_det, Vnum_det) %>%
  as_tibble(rownames = "Model") %>%
  tidyr::pivot_longer(starts_with("SE"), names_to = "SEtype", values_to = "SE") %>%
  ggplot() +
  geom_vline(xintercept = 0, col = "blue") +
  geom_errorbarh(aes(xmin = `E[D]_obs` - 2 * SE, xmax = `E[D]_obs` + 2 * SE,
                   y = Model,
                   col = SEtype, lty = SEtype)) +
  geom_point(aes(x = `E[D]_obs`, y = Model)) +
  scale_x_continuous("Mean Residual") +
  ggtitle("Mean Residual of Number of Detected Spccies")

varplt <- Enum_compare(obsnumbers, Enum_det, Vnum_det) %>%
  as_tibble(rownames = "Model") %>%
  tidyr::pivot_longer(starts_with("V"), names_to = "Vtype", values_to = "Variance") %>%
  ggplot() +
  geom_point(aes(x = sqrt(Variance), y = Model, col = Vtype, shape = Vtype)) +
  scale_x_continuous("Standard Deviation of Residual") +
  theme(legend.position = "bottom") +
  ggtitle("Standard Deviation of Number of Detected Species")

coverageplt <- Enum_coverage(obsnumbers, Enum_det, Vnum_det)[["mean"]] %>%
  tibble::enframe(name = "Model", value = "Pcnt. Coverage") %>%
  ggplot() +
  geom_point(aes(x = `Pcnt. Coverage`, y = Model)) +
  geom_vline(xintercept = 0.95, col = "blue") +
  theme(legend.position = "bottom") +
  ggtitle("Coverage of approx. 95% credible intervals for biodiversity")

print(meanplt / (varplt + coverageplt) + plot_annotation('In Sample Biodiversity, marginal over LV distribution'))

Expected Number of Detections for Each Species

obsdetections <- colSums(fittedmods[[1]]$data$y)

Edetections_l <- lapply(fittedmods,
                      function(x){
                        EnV <- Endetect_modelsite(x,
                                            conditionalLV = (!is.null(x$data$nlv) && (x$data$nlv > 0) ))
                        Edet_sum <- colSums(EnV[["E_ndetect"]])
                        Vdet_sum <- colSums(EnV[["V_ndetect"]])
                        return(data.frame(Species = names(Edet_sum), Edet_sum = Edet_sum, Vdet_sum = Vdet_sum))
                      }
                      )
Edetections <- bind_rows(Edetections_l, .id = "Model")

Edetections %>% 
  as_tibble(Edetections) %>%
  ggplot() +
  geom_pointrange(aes(ymin = Edet_sum - 2 * sqrt(Vdet_sum),
                      y = Edet_sum, 
                      ymax = Edet_sum + 2 * sqrt(Vdet_sum),
                      x = Species,
                      col = Model),
                  inherit.aes = FALSE,
                  position = position_dodge(width = 1, preserve = "total"),
                  fatten = 1) +
  geom_col(aes(y = Detections, x = Species), data = obsdetections %>% enframe(name = "Species", value = "Detections"),
             alpha = 0.2,
             col = "black",
             fill = "black",
             inherit.aes = FALSE) +
  coord_flip() +
  # geom_point(aes(x = Edet_sum, y = Species, col = Model, shape = Model), inherit.aes = FALSE) +
  scale_color_viridis_d() +
  # scale_shape_manual(values = rep(0:5, 3)) +
  scale_y_continuous(name = "Expected Detections") +
  ggtitle("Expected Number of Detections vs Observed")

LV Values vs Covariates

pltbase <- cleanersummaries %>%
  filter(varname == "LV") %>%
  filter(varname == "LV", (Lower95 > 0) | (Upper95 < 0)) %>%
  inner_join(occcovar, by = "ModelSite") %>%
  tidyr::pivot_longer(c(names(occcovar), -ModelSite),
                      names_to = "OccCovariate",
                      values_to = "OccCovariateValue") %>%
  ggplot()

pltpoints <- geom_point(aes(x = OccCovariateValue, y = Median, shape = Covariate),
                        alpha = 0.2)

pltsmooth <- geom_smooth(aes(x = OccCovariateValue, y = Median, col = Covariate),
                                  data = function(x) x %>% dplyr::filter(!(OccCovariate %in% treatfactor)),
                                  method = "gam", level = 0.95, formula = y ~ s(x, bs = "cs"))
pltsmooth_low95 <- geom_smooth(aes(x = OccCovariateValue, y = Lower95, col = Covariate),
                                  data = function(x) x %>% dplyr::filter(!(OccCovariate %in% treatfactor)),
                                  method = "gam", level = 0.95, formula = y ~ s(x, bs = "cs"),
                                  lty = "dashed", lwd = 0.5)
pltsmooth_high95 <- geom_smooth(aes(x = OccCovariateValue, y = Upper95, col = Covariate),
                                  data = function(x) x %>% dplyr::filter(!(OccCovariate %in% treatfactor)),
                                  method = "gam", level = 0.95, formula = y ~ s(x, bs = "cs"),
                                  lty = "dashed", lwd = 0.5)
pltfactor <- ggplot2::stat_summary(aes(x = OccCovariateValue, y = Median, col = Covariate),
                                  data = function(x) x %>% dplyr::filter(OccCovariate %in% treatfactor),
                                  geom = "pointrange",
                                  fun.data = mean_se,
                                  fun.args = list(mult = 2),
                                  position = position_dodge(width = 0.1, preserve = "total"),
                                  alpha = 0.8,
                                  lwd = 1.2,
                                  fatten = 1,
                                  show.legend = FALSE)
pltfactor_low95 <- ggplot2::stat_summary(aes(x = OccCovariateValue, y = Lower95, col = Covariate),
                                  data = function(x) x %>% dplyr::filter(OccCovariate %in% treatfactor),
                                  geom = "pointrange",
                                  fun.data = mean_se,
                                  fun.args = list(mult = 2),
                                  position = position_dodge(width = 0.1, preserve = "total"),
                                  alpha = 0.8,
                                  lwd = 0.2,
                                  fatten = 1,
                                  show.legend = FALSE)
pltfactor_high95 <- ggplot2::stat_summary(aes(x = OccCovariateValue, y = Upper95, col = Covariate),
                                  data = function(x) x %>% dplyr::filter(OccCovariate %in% treatfactor),
                                  geom = "pointrange",
                                  fun.data = mean_se,
                                  fun.args = list(mult = 2),
                                  position = position_dodge(width = 0.1, preserve = "total"),
                                  alpha = 0.8,
                                  lwd = 0.2,
                                  fatten = 1,
                                  show.legend = FALSE)


pltbase + 
  pltpoints +
  pltsmooth + pltfactor +
  pltsmooth_low95 + pltsmooth_high95 +
  pltfactor_low95 + pltfactor_high95 +
  facet_grid(rows = vars(Model),
             cols = vars(OccCovariate),
             scale = "free_x") +
  geom_hline(yintercept = 0, col = "blue", lty = "dashed") +
  theme(strip.text.x.top = element_text(angle = 90)) +
  scale_color_viridis_d(name = "Latent Variable") +
  scale_x_continuous(name = "Covariate Value") +
  ggtitle("Fitted Statistically Non-Zero LV Values")

Quick check of residual distribution for all models

Note that there are too many data points to apply the shapiro-Wilks normality test directly, however the Anderson-Darling test can be applied. I do not know enough about this test yet to know if it can be fully trusted.

resid_occ_l <- lapply(fittedmods, ds_occupancy_residuals.fit, type = "median", seed = 321, conditionalLV = FALSE)
vapply(resid_occ_l, function(x) goftest::ad.test(unlist(x[, -1]), null = "pnorm")$p.value, FUN.VALUE = 3.3)
as_tibble(lapply(resid_occ_l, function(x) unlist(x[, -1]))) %>%
  tidyr::pivot_longer(everything(), names_to = "Model", values_to = "Residual") %>%
  ggplot() +
  geom_qq(aes(sample = Residual), na.rm = TRUE) +
  geom_abline(slope = 1, intercept = 0, lty = "dashed") +
  facet_grid(cols = vars(Model)) +
  ggtitle("Occupancy Residual Distribution: qq plots")
resid_det_l <- lapply(fittedmods, ds_detection_residuals.fit, type = "median", seed = 321)
vapply(resid_det_l, function(x) {
  vals <- unlist(x[, -1])
  vals <- vals[!is.na(vals)]
  goftest::ad.test(vals, null = "pnorm")$p.value},
  FUN.VALUE = 3.3)
as_tibble(lapply(resid_det_l, function(x) unlist(x[, -1]))) %>%
  tidyr::pivot_longer(everything(), names_to = "Model", values_to = "Residual") %>%
  ggplot() +
  geom_qq(aes(sample = Residual), na.rm = TRUE) +
  geom_abline(slope = 1, intercept = 0, lty = "dashed") +
  facet_grid(cols = vars(Model)) +
  ggtitle("Detection Residual Distribution: qq plots")

Occupancy Residuals

# bytes <- vapply(ls(), function(objname) object.size(get(objname)), FUN.VALUE = 213)
# sort(bytes/1E9)
rm(resid_det_l)
rm(resid_occ_l)
seeds = c(321, 120)
df_l <- lapply(seeds, function(x) {
  resid_occ_l <- lapply(fittedmods,
                        function(fit) {
    resids <- ds_occupancy_residuals.fit(fit, type = "median", seed = x, conditionalLV = (!is.null(fit$nlv) && fit$nlv > 0))
    return(resids)})
  resid_occ_df <- bind_rows(resid_occ_l, .id = "Model")
  df <- resid_occ_df %>%
    tidyr::pivot_longer(-c(ModelSite, Model),
                   names_to = "Species",
                   values_to = "Residual",
                   values_drop_na = TRUE) %>%
    left_join(occcovar %>% 
                tidyr::pivot_longer(-ModelSite,
                   names_to = "Covariate",
                   values_to = "CovariateValue"),
              by = "ModelSite")
  return(df)
})
names(df_l) <- seeds
df <- bind_rows(df_l, .id = "seed")
rm(df_l)

pltbase <- df %>%
  ggplot()

pltpoints <- ggplot2::geom_point(aes(x = CovariateValue, y = Residual), data = function(x) x[x$seed == seeds[[1]], ])

# gam smooths for continuous variables
pltsmooth <- ggplot2::geom_smooth(aes(x = CovariateValue, y = Residual, col = seed),
                                  data = function(x) x %>% dplyr::filter(!(Covariate %in% treatfactor)),
                                  method = "gam", level = 0.95, formula = y ~ s(x, bs = "cs"))

# mean + 2SE summaries for discrete variables
pltfactor <- ggplot2::stat_summary(aes(x = CovariateValue, y = Residual, col = seed),
                                  data = function(x) x %>% dplyr::filter(Covariate %in% treatfactor),
                                  geom = "pointrange",
                                  fun.data = mean_se,
                                  fun.args = list(mult = 2),
                                  position = position_dodge(width = 0.1, preserve = "total"),
                                  alpha = 0.8,
                                  lwd = 1.2,
                                  fatten = 1,
                                  show.legend = FALSE)

plt <- pltbase + pltsmooth + pltfactor + 
  ggplot2::facet_grid(rows = vars(Model), cols = vars(Covariate), as.table = TRUE, scales = "free_x") +
  ggplot2::geom_hline(yintercept = 0, col = "blue", lty = "dashed") +
  ggplot2::scale_x_continuous(name = "Covariate Value") +
  scale_y_continuous(name = "Occupancy Residual")
plt <- plt + coord_cartesian(ylim = c(-0.1, 0.1)) + ggtitle("Occupancy Residuals vs Covariates") +
  theme(strip.text.y.right = element_text(angle = 0))

print(plt)

rm(df)

Detection Residuals

seeds = c(321, 120)
df_l <- lapply(seeds, function(x) {
  resid_det_l <- lapply(fittedmods, function(fit) {
    resids <- ds_detection_residuals.fit(fit, type = "median", seed = x)
    return(resids)})
  resid_det_df <- bind_rows(resid_det_l, .id = "Model")
  df <- resid_det_df %>%
    tidyr::pivot_longer(-c(ModelSite, Model),
                   names_to = "Species",
                   values_to = "Residual",
                   values_drop_na = TRUE) %>%
    left_join(detcovar %>% 
                tidyr::pivot_longer(-ModelSite,
                   names_to = "Covariate",
                   values_to = "CovariateValue"),
              by = "ModelSite")
  return(df)
})
names(df_l) <- seeds
df_det <- bind_rows(df_l, .id = "seed")
rm(df_l)

pltbase <- df_det %>%
  ggplot() 

plt <- pltbase + pltsmooth + pltfactor + 
  ggplot2::facet_grid(rows = vars(Model), cols = vars(Covariate), as.table = TRUE, scales = "free_x") +
  ggplot2::geom_hline(yintercept = 0, col = "blue", lty = "dashed") +
  ggplot2::scale_x_continuous(name = "Covariate Value") +
  scale_y_continuous(name = "Detection Residual")
plt + coord_cartesian(ylim = c(-0.1, 0.1)) + ggtitle("Detection Residuals vs Covariates") +
  theme(strip.text.y.right = element_text(angle = 0))

rm(df_det)

Conclusions

Next Steps



sustainablefarms/msod documentation built on March 6, 2023, 7:17 a.m.