knitr::opts_chunk$set(echo = FALSE) library(tibble) library(dplyr) library(MCMCpack) library(mclust) library(corrplot) library(coda) library(runjags) library(ggplot2) library(patchwork) library(msod)
varname2type <- function(varnames){ types <- case_when( grepl("lv.coef", varnames) ~ "LV Load", grepl("LV", varnames) ~ "LV", grepl("^(mu|tau|sigma)", varnames) ~ "Comm Param", #parameters of community distributions grepl("^u.b", varnames) ~ "Occu Coef", grepl("^v.b", varnames) ~ "Detn Coef", TRUE ~ "other" ) return(types) }
modelspecs <- readRDS("./7_3_03_modelspecs.rds") filenames <- lapply(modelspecs, function(x) x$filename ) # test loading models a <- vapply(filenames, file.exists, FUN.VALUE = FALSE) stopifnot(all(a)) # load and remove crosscorrelation fittedmods <- lapply(filenames, function(x) { fit <- readRDS(x) return(fit)})
inputdata <- readRDS(readLines("../link_7_2_10_input_data.txt")[[1]]) # the following whittle out the covariates that we are interested in the residual diagnostics detcovar <- model.matrix(~ ModelSiteID + MeanWind + MeanTime + MeanClouds + MeanTemp + ObserverId - 1, data = inputdata$insampledata$yXobs) %>% as_tibble() %>% rename(ModelSite = ModelSiteID) occcovar <- model.matrix(as.formula(modelspecs[[1]]$OccFmla), data = inputdata$insampledata$Xocc) %>% as_tibble() %>% mutate(ModelSite = inputdata$insampledata$Xocc$ModelSiteID, StudyId = inputdata$insampledata$Xocc$StudyId, latitude = inputdata$insampledata$Xocc$latitude, SurveyYear = inputdata$insampledata$Xocc$SurveyYear) %>% dplyr::select(-`(Intercept)`) treatfactor_occ <- occcovar %>% summarise_all(~ n_distinct(.)) %>% tidyr::pivot_longer(everything(), names_to = "CovarName") %>% dplyr::filter(value <= 10) %>% dplyr::select(CovarName) %>% unlist() treatfactor_det <- detcovar %>% summarise_all(~ n_distinct(.)) %>% tidyr::pivot_longer(everything(), names_to = "CovarName") %>% dplyr::filter(value <= 10) %>% dplyr::select(CovarName) %>% unlist() treatfactor = c(treatfactor_occ, treatfactor_det)
lpds_l <- readRDS("./fittedmodels/7_3_03_lpds.rds") waics_l <- readRDS("./fittedmodels/7_3_03_waics.rds") Enums_holdout <- readRDS("./fittedmodels/7_3_03_many_Enum_holdout.rds") Enums_insample <- readRDS("./fittedmodels/7_3_03_many_Enum_insample_margLV.rds") Enums_insample_condLV <- readRDS("./fittedmodels/7_3_03_many_Enum_insample_condLV.rds") names(lpds_l) <- gsub("someclimate_year_woody500m_msnm_det1stO", "model", names(lpds_l)) names(waics_l) <- gsub("someclimate_year_woody500m_msnm_det1stO", "model", names(waics_l)) names(Enums_holdout) <- gsub("someclimate_year_woody500m_msnm_det1stO", "model", names(Enums_holdout)) names(Enums_insample) <- gsub("someclimate_year_woody500m_msnm_det1stO", "model", names(Enums_insample)) names(Enums_insample_condLV) <- gsub("someclimate_year_woody500m_msnm_det1stO", "model", names(Enums_insample_condLV))
cleanersummary <- function(fit) { vals <- fit$summaries %>% as_tibble(rownames = "bugsvarname") %>% rename_with(function(x) "AC_10", starts_with("AC")) %>% mutate(varname = gsub("\\[.*\\]", "", bugsvarname)) %>% mutate(type = varname2type(bugsvarname)) suppressWarnings(vals <- vals %>% mutate(SpeciesIdx = as.integer(gsub("(.*\\[|,.*\\])", "", bugsvarname))) %>% mutate(CovarIdx = as.integer(gsub("(.*\\[.*,|\\])", "", bugsvarname))) ) vals <- vals %>% mutate(Species = case_when( varname == "LV" ~ as.character(NA), TRUE ~ fit$species[SpeciesIdx])) %>% mutate(ModelSite = case_when( varname == "LV" ~ SpeciesIdx, TRUE ~ NA_integer_)) %>% mutate(Covariate = case_when( varname == "u.b" ~ colnames(fit$data$Xocc)[CovarIdx], varname == "v.b" ~ colnames(fit$data$Xobs)[CovarIdx], varname == "LV" ~ paste0("LV", as.character(CovarIdx)), varname == "lv.coef" ~ paste0("LV", as.character(CovarIdx)), TRUE ~ as.character(NA) )) %>% dplyr::select(-SpeciesIdx, -CovarIdx) return(vals) } cleanersummary_l <- lapply(fittedmods, cleanersummary) cleanersummaries <- bind_rows(cleanersummary_l, .id = "Model")
cat("MCMC time:\n") lapply(fittedmods, function(x) { if (!is.null(x$timetaken)) {return(runjags::timestring(as.numeric(x$timetaken, units="secs")))} else return(NULL)})
Comment on time taken for the MCMC of each model:
Values less than 1.1 are desired.
cleanersummaries %>% mutate(type = varname2type(varname)) %>% ggplot() + facet_grid(rows = vars(type), cols = vars(Model), scales = "free") + geom_point(aes(y = bugsvarname, x = psrf) ) + geom_vline(xintercept = 1.1, col = "blue", lty = "dashed") + scale_y_discrete(name = "Variable Name") + theme(strip.text.x.top = element_text(angle = 90)) + scale_x_continuous(name = "Gelman-Rubin Statistic")
Comment on convergence using the Gelman-Rubin statistic
Effective sample size should be similar order of magnitude to the number of MCMC samples.
goodthresh <- fittedmods[[1]]$sample * length(fittedmods[[1]]$mcmc) / 4 plt1 <- cleanersummaries %>% mutate(type = varname2type(varname)) %>% ggplot() + facet_grid(rows = vars(type), cols = vars(Model), scales = "free") + geom_point(aes(y = bugsvarname, x = SSeff, col = SSeff < goodthresh) ) + geom_vline(xintercept = goodthresh, col = "red", lty = "dashed") + scale_y_discrete(name = "Variable Name") + theme(strip.text.x.top = element_text(angle = 90)) + scale_x_continuous(name = "Effective Sample Size", trans = "log10") plt2 <- cleanersummaries %>% mutate(type = varname2type(varname)) %>% ggplot() + facet_grid(rows = vars(type), cols = vars(Model), scales = "free") + geom_histogram(aes(x = SSeff), bins = 50 ) + geom_vline(xintercept = goodthresh, col = "red", lty = "dashed") + scale_y_discrete(name = "Variable Name") + theme(strip.text.x.top = element_blank()) + scale_x_continuous(name = "Effective Sample Size", trans = "log10") plt1 / plt2
See http://www.ugrad.stat.ubc.ca/R/library/coda/html/geweke.diag.html for a very quick description of this. The Geweke values are Z-scores (technically t-distributed in this situation?). 95% of (independent) Geweke values should be within 2 standard deviations (i.e. just 2 for Z-scores) of the mean.
gwk <- bind_rows(lapply(fittedmods, function(x) enframe(geweke.diag(x, frac1=0.1, frac2=0.5)$z, name = "varname")), .id = "Model") plt1 <- gwk %>% mutate(type = varname2type(varname)) %>% ggplot() + facet_grid(rows = vars(type), cols = vars(Model), scales = "free") + geom_point(aes(y = varname, x = value, col = abs(value) < 2) ) + geom_vline(xintercept = c(-2, 2), col = "blue", lty = "dashed") + scale_y_discrete(name = "Variable Name") + theme(strip.text.x.top = element_text(angle = 90)) + scale_x_continuous(name = NULL) plt2 <- gwk %>% mutate(type = varname2type(varname)) %>% ggplot() + facet_grid(rows = vars(type), cols = vars(Model), scales = "free") + geom_histogram(aes(x = value), bins = 30 ) + geom_vline(xintercept = c(-2, 2), col = "blue", lty = "dashed") + scale_y_continuous(name = "Variable Name") + theme(strip.text.x.top = element_blank()) + scale_x_continuous(name = "Geweke Diagnostic Statistics") plt1 / plt2
Summarise behaviour of the MCMC chains
cleanersummaries %>% dplyr::filter(varname == "u.b") %>% dplyr::mutate(ci95_misses_0 = !((Lower95 < 0) & (Upper95 > 0))) %>% ggplot() + facet_wrap(vars(Covariate), nrow = 1) + geom_hline(yintercept = 0, col = "blue", lty = "dashed") + geom_pointrange(aes(x = Species, ymin = Lower95, ymax = Upper95, y = Median, col = Model, alpha = ci95_misses_0, group = Model), shape = "|", size = 1, position = position_dodge(width = 1)) + scale_y_continuous(name = "Posterior Estimate") + coord_flip() + scale_color_viridis_d() + theme(strip.text.x.top = element_text(angle = 90)) + ggtitle("Occupancy Covariate Loadings per Species and Model")
cleanersummaries %>% dplyr::filter(varname == "v.b") %>% dplyr::mutate(ci95_misses_0 = !((Lower95 < 0) & (Upper95 > 0))) %>% ggplot() + facet_wrap(vars(Covariate), nrow = 1) + geom_hline(yintercept = 0, col = "blue", lty = "dashed") + geom_pointrange(aes(x = Species, ymin = Lower95, ymax = Upper95, y = Median, col = Model, alpha = ci95_misses_0, group = Model), shape = "|", size = 1, position = position_dodge(width = 1)) + scale_y_continuous(name = "Posterior Estimate") + coord_flip() + scale_color_viridis_d() + theme(strip.text.x.top = element_text(angle = 90)) + ggtitle("Detection Covariate Loadings per Species and Model")
cleanersummaries %>% dplyr::filter(varname == "lv.coef") %>% ggplot() + facet_wrap(vars(Covariate), nrow = 1) + geom_hline(yintercept = 0) + geom_pointrange(aes(x = Species, ymin = Lower95, ymax = Upper95, y = Median, col = Model, group = Model), shape = "|", size = 1, position = position_dodge(width = 1)) + coord_flip() + scale_color_viridis_d() + theme(strip.text.x.top = element_text(angle = 90)) + ggtitle("LV Loadings per Species and Model")
lpds_df <- as_tibble(do.call(cbind, lapply(fittedmods, function(x) x$quality$holdout$lpd$lpds))) melpd <- data.frame(Estimate = apply(lpds_df, 2, mean), SE = apply(lpds_df, 2, sd) / sqrt(nrow(lpds_df))) lpds_df %>% as_tibble() %>% rowid_to_column(var = "HoldOutModelSite") %>% tidyr::pivot_longer(-HoldOutModelSite, names_to = "model", values_to = "Site_lpd") %>% ggplot() + # facet_grid(rows = vars(model), scales = "free_y") + geom_violin(aes(x = Site_lpd, y = model), adjust = 0.2) + stat_summary(aes(x = Site_lpd, y = model), fun.data=mean_se, fun.args = list(mult=2), geom="crossbar", width=0.2 ) + stat_summary(aes(x = Site_lpd, y = model), fun = median, geom = "point", shape = 23, size = 2) + ggtitle("Log-Posterior Density of Holdout ModelSites")
Below is the lpd computed for each ModelSite for both the InSample and out of sample data.
insamplelpds_df <- do.call(cbind, lapply(fittedmods, function(x) x$quality$insample$waic$pointwise[, "elpd_waic"])) %>% as_tibble() %>% mutate(InSample = TRUE) df <- lpds_df %>% as_tibble() %>% mutate(InSample = FALSE) # %>% tidyr::pivot_longer(-InSample, names_to = "model", values_to = "site_lpd") df <- bind_rows(insamplelpds_df, df) df %>% as_tibble() %>% tidyr::pivot_longer(-InSample, names_to = "model", values_to = "site_lpd") %>% ggplot() + facet_grid(rows = vars(model), scales = "free_y", switch = "y") + geom_violin(aes(x = site_lpd, y = InSample, col = InSample), adjust = 0.2, position = position_dodge(1)) + stat_summary(aes(x = site_lpd, y = InSample, group = InSample), fun.data=mean_se, fun.args = list(mult=2), geom="crossbar", width=0.2 ) + theme(strip.text.y.left = element_text(angle = 0)) + ggtitle("Log Posterior Density of ModelSites")
waics_average_elpd_per_point <- lapply(fittedmods, function(x) x$quality$insample$waic$estimates["elpd_waic", ]/nrow(x$quality$insample$waic$pointwise)) waics_average_elpd_per_point <- simplify2array(waics_average_elpd_per_point) waics_average_elpd_per_point <- t(waics_average_elpd_per_point) %>% as_tibble(rownames = "Model") waics_average_elpd_per_point$type = "WAIC" loo_average_elpd_per_point <- lapply(fittedmods, function(x) x$quality$insample$loo$estimates["elpd_loo", ]/nrow(x$quality$insample$loo$pointwise)) loo_average_elpd_per_point <- simplify2array(loo_average_elpd_per_point) loo_average_elpd_per_point <- t(loo_average_elpd_per_point) %>% as_tibble(rownames = "Model") loo_average_elpd_per_point$type = "LOO-PSIS" melpd$type <- "Holdout" melpd <- as_tibble(melpd, rownames = "Model") df <- bind_rows(waics_average_elpd_per_point, loo_average_elpd_per_point, melpd) df %>% ggplot() + geom_errorbar(aes(x = Model, ymax = Estimate + 2*SE, ymin = Estimate - 2*SE, col = type, lty = type)) + geom_point(aes(x = Model, y = Estimate, col = type), position = position_jitter(width = 0.1)) + coord_flip() + ggtitle("Estimates of Log Posterior Density for a new ModelSite")
ModelSites which diagnositics suggest have a bad WAIC estimate or loo-psis estimate are investigated below:
lpds_df_diagnose <- bind_rows(lapply(fittedmods, function(x) { waicsvals <- as_tibble(x$quality$insample$waic$pointwise[, c("elpd_waic", "p_waic")]) loovals <- as_tibble(x$quality$insample$loo$pointwise[, c("elpd_loo", "p_loo")]) loodiagnose <- as_tibble(x$quality$insample$loo$diagnostics) return(cbind(ModelSite = 1:nrow(waicsvals), waicsvals, loovals, loodiagnose)) }), .id = "model") lpds_df_diagnose %>% mutate(is_ok = case_when( pareto_k < 0.5 ~ "good", pareto_k < 0.7 ~ "ok", pareto_k >= 0.7 ~ "bad" )) %>% mutate(is_ok = factor(is_ok)) %>% group_by(model) %>% count(is_ok) %>% tidyr::pivot_wider(names_from = is_ok, values_from = n) lpds_df_diagnose %>% ggplot() + facet_grid(rows = vars(model), scales = "free_y", switch = NULL) + geom_point(aes(x = elpd_waic, y = p_waic, col = p_waic < 0.4)) + geom_hline(yintercept = 0.4, col = "blue") + #see [1]A. Vehtari, A. Gelman, and J. Gabry, "Practical Bayesian model evaluation using leave-one-out cross-validation and WAIC," Stat Comput, vol. 27, pp. 1413-1432, Sep. 2017 # geom_rug(aes(x = elpd_waic), # data = function(x) dplyr::filter(x, p_waic > 0.4), # sides = "t") + theme(strip.text.y.right = element_text(angle = 0)) + scale_y_continuous(name = "p_waic", trans = "log") + ggtitle("Diagnostics of WAIC") lpds_df_diagnose %>% mutate(is_ok = case_when( pareto_k < 0.5 ~ "good", pareto_k < 0.7 ~ "ok", pareto_k >= 0.7 ~ "bad" )) %>% ggplot() + facet_grid(rows = vars(model), switch = NULL) + geom_point(aes(x = ModelSite, y = pareto_k, col = is_ok)) + geom_hline(yintercept = 0.5, col = "blue") + #see [1]A. Vehtari, A. Gelman, and J. Gabry, "Practical Bayesian model evaluation using leave-one-out cross-validation and WAIC," Stat Comput, vol. 27, pp. 1413-1432, Sep. 2017 geom_hline(yintercept = 0.7, col = "blue") + #see [1]A. Vehtari, A. Gelman, and J. Gabry, "Practical Bayesian model evaluation using leave-one-out cross-validation and WAIC," Stat Comput, vol. 27, pp. 1413-1432, Sep. 2017 theme(strip.text.y.right = element_text(angle = 0)) + ggtitle("Diagnostics of LOO-PSIS: Pareto_k for ModelSites") lpds_df_diagnose %>% mutate(is_ok = case_when( pareto_k < 0.5 ~ "good", pareto_k < 0.7 ~ "ok", pareto_k >= 0.7 ~ "bad" )) %>% dplyr::filter(is_ok == "bad") %>% ggplot() + facet_grid(rows = vars(model), switch = NULL) + geom_point(aes(x = p_loo, y = pareto_k)) + theme(strip.text.y.right = element_text(angle = 0)) + ggtitle("Diagnostics of LOO-PSIS: p_loo of pareto_k for ModelSites") lpds_df_diagnose %>% mutate(is_ok = case_when( pareto_k < 0.5 ~ "good", pareto_k < 0.7 ~ "ok", pareto_k >= 0.7 ~ "bad" )) %>% dplyr::filter(is_ok == "bad") %>% # inner_join(detcovar, by = "ModelSite") %>% inner_join(occcovar, by = "ModelSite") %>% dplyr::select(-elpd_waic, -p_waic, -elpd_loo, -n_eff) %>% tidyr::pivot_longer(c(-model, -ModelSite, -p_loo, -pareto_k, -is_ok), names_to = "Covariate", values_to = "CovariateValue") %>% ggplot() + facet_wrap(vars(Covariate), scales = "free") + theme(strip.text.y.right = element_text(angle = 0)) + geom_point(aes(x = CovariateValue, y = factor(ModelSite), col = model), shape = "+", position = "jitter") + scale_x_continuous(trans = scales::modulus_trans(0)) lpds_df_diagnose %>% mutate(is_ok = case_when( pareto_k < 0.5 ~ "good", pareto_k < 0.7 ~ "ok", pareto_k >= 0.7 ~ "bad" )) %>% dplyr::filter(is_ok == "bad") %>% inner_join(detcovar, by = "ModelSite") %>% # inner_join(occcovar, by = "ModelSite") %>% dplyr::select(-elpd_waic, -p_waic, -elpd_loo, -n_eff) %>% tidyr::pivot_longer(c(-model, -ModelSite, -p_loo, -pareto_k, -is_ok), names_to = "Covariate", values_to = "CovariateValue") %>% ggplot() + facet_wrap(vars(Covariate), scales = "free") + theme(strip.text.y.right = element_text(angle = 0)) + geom_point(aes(x = CovariateValue, y = factor(ModelSite), col = model), shape = "+", position = "jitter") + scale_x_continuous(trans = scales::modulus_trans(0)) lpds_df_diagnose %>% mutate(is_ok = case_when( pareto_k < 0.5 ~ "good", pareto_k < 0.7 ~ "ok", pareto_k >= 0.7 ~ "bad" )) %>% dplyr::filter(is_ok == "bad") %>% # inner_join(detcovar, by = "ModelSite") %>% inner_join(occcovar, by = "ModelSite") %>% dplyr::select(-elpd_waic, -p_waic, -elpd_loo, -n_eff) %>% tidyr::pivot_longer(c(-model, -ModelSite, -p_loo, -pareto_k, -is_ok), names_to = "Covariate", values_to = "CovariateValue") %>% ggplot() + facet_wrap(vars(Covariate), scales = "free") + theme(strip.text.y.right = element_text(angle = 0)) + geom_freqpoly(aes(x = CovariateValue, y = ..density.., lty = model, col = model), position = "stack") + geom_freqpoly(aes(x = CovariateValue, y = ..density..), data = tidyr::pivot_longer(occcovar, c(-ModelSite), names_to = "Covariate", values_to = "CovariateValue"), col = "black") + scale_x_continuous(trans = scales::modulus_trans(0)) lpds_df_diagnose %>% mutate(is_ok = case_when( pareto_k < 0.5 ~ "good", pareto_k < 0.7 ~ "ok", pareto_k >= 0.7 ~ "bad" )) %>% dplyr::filter(is_ok == "bad") %>% inner_join(detcovar, by = "ModelSite") %>% dplyr::select(-elpd_waic, -p_waic, -elpd_loo, -n_eff) %>% tidyr::pivot_longer(c(-model, -ModelSite, -p_loo, -pareto_k, -is_ok), names_to = "Covariate", values_to = "CovariateValue") %>% ggplot() + facet_wrap(vars(Covariate), scales = "free") + theme(strip.text.y.right = element_text(angle = 0)) + geom_freqpoly(aes(x = CovariateValue, y = ..density.., lty = model, col = model), position = "stack") + geom_freqpoly(aes(x = CovariateValue, y = ..density..), data = tidyr::pivot_longer(detcovar, c(-ModelSite), names_to = "Covariate", values_to = "CovariateValue"), col = "black") + scale_x_continuous(trans = scales::modulus_trans(0))
According to help page 'loo-glossary', if p_loo << [total parameters in the model], then the model is likely misspecified.
elpd_compare(lpds_df) elpd_compare(elpds = do.call(cbind, lapply(fittedmods, function(x) x$quality$insample$waic$pointwise[, 1]))) elpd_compare(elpds = do.call(cbind, lapply(fittedmods, function(x) x$quality$insample$loo$pointwise[, 1]))) plot_compare_loo <- function(compare.loo.obj){ plt <- compare.loo.obj %>% as_tibble(rownames = "Model") %>% ggplot() + geom_errorbarh(aes(xmin = elpd_diff - 2 * se_diff, xmax = elpd_diff + 2 * se_diff, y = Model)) + geom_point(aes(x = elpd_diff, y = Model)) + geom_vline(xintercept = 0, col = "blue") + scale_x_continuous("Pointwise Difference in Expected Log Posterior Density") return(plt) } elpd_compare(lpds_df) %>% plot_compare_loo() + ggtitle("Holdout elpd Differences") elpd_compare(elpds = do.call(cbind, lapply(fittedmods, function(x) x$quality$insample$waic$pointwise[, 1]))) %>% plot_compare_loo() + ggtitle("WAICS Differences") elpd_compare(elpds = do.call(cbind, lapply(fittedmods, function(x) x$quality$insample$waic$pointwise[, 1]))) %>% plot_compare_loo() + ggtitle("LOO-PSIS Differences")
obsnumbers <- detectednumspec(inputdata$holdoutdata$yXobs[, inputdata$species], inputdata$holdoutdata$yXobs[, "ModelSiteID", drop = TRUE]) Enum_det <- do.call(cbind, lapply(fittedmods, function(x) x$quality$holdout$predspecnum["Esum_det", , drop = TRUE])) Vnum_det <- do.call(cbind, lapply(fittedmods, function(x) x$quality$holdout$predspecnum["Vsum_det", , drop = TRUE])) Enum_compare(obsnumbers, Enum_det, Vnum_det) meanplt <- Enum_compare(obsnumbers, Enum_det, Vnum_det) %>% as_tibble(rownames = "Model") %>% tidyr::pivot_longer(starts_with("SE"), names_to = "SEtype", values_to = "SE") %>% ggplot() + geom_vline(xintercept = 0, col = "blue") + geom_errorbarh(aes(xmin = `E[D]_obs` - 2 * SE, xmax = `E[D]_obs` + 2 * SE, y = Model, col = SEtype, lty = SEtype)) + geom_point(aes(x = `E[D]_obs`, y = Model)) + scale_x_continuous("Mean Residual") + ggtitle("Mean Residual of Number of Detected Spccies") varplt <- Enum_compare(obsnumbers, Enum_det, Vnum_det) %>% as_tibble(rownames = "Model") %>% tidyr::pivot_longer(starts_with("V"), names_to = "Vtype", values_to = "Variance") %>% ggplot() + geom_point(aes(x = sqrt(Variance), y = Model, col = Vtype, shape = Vtype)) + scale_x_continuous("Standard Deviation of Residual") + theme(legend.position = "bottom") + ggtitle("Standard Deviation of Number of Detected Species") coverageplt <- Enum_coverage(obsnumbers, Enum_det, Vnum_det)[["mean"]] %>% tibble::enframe(name = "Model", value = "Pcnt. Coverage") %>% ggplot() + geom_point(aes(x = `Pcnt. Coverage`, y = Model)) + geom_vline(xintercept = 0.95, col = "blue") + theme(legend.position = "bottom") + ggtitle("Coverage of approx. 95% credible intervals for biodiversity") print(meanplt / (varplt + coverageplt) + plot_annotation('Holdout Biodiversity'))
obsnumbers <- detectednumspec(inputdata$insample$yXobs[, inputdata$species], inputdata$insample$yXobs[, "ModelSiteID", drop = TRUE]) Enum_det <- do.call(cbind, lapply(fittedmods, function(x) x$quality$insample$predspecnum["Esum_det_margpost", , drop = TRUE])) Vnum_det <- do.call(cbind, lapply(fittedmods, function(x) x$quality$insample$predspecnum["Vsum_det_margpost", , drop = TRUE])) Enum_compare(obsnumbers, Enum_det, Vnum_det) meanplt <- Enum_compare(obsnumbers, Enum_det, Vnum_det) %>% as_tibble(rownames = "Model") %>% tidyr::pivot_longer(starts_with("SE"), names_to = "SEtype", values_to = "SE") %>% ggplot() + geom_vline(xintercept = 0, col = "blue") + geom_errorbarh(aes(xmin = `E[D]_obs` - 2 * SE, xmax = `E[D]_obs` + 2 * SE, y = Model, col = SEtype, lty = SEtype)) + geom_point(aes(x = `E[D]_obs`, y = Model)) + scale_x_continuous("Mean Residual") + ggtitle("Mean Residual of Number of Detected Spccies") varplt <- Enum_compare(obsnumbers, Enum_det, Vnum_det) %>% as_tibble(rownames = "Model") %>% tidyr::pivot_longer(starts_with("V"), names_to = "Vtype", values_to = "Variance") %>% ggplot() + geom_point(aes(x = sqrt(Variance), y = Model, col = Vtype, shape = Vtype)) + scale_x_continuous("Standard Deviation of Residual") + theme(legend.position = "bottom") + ggtitle("Standard Deviation of Number of Detected Species") coverageplt <- Enum_coverage(obsnumbers, Enum_det, Vnum_det)[["mean"]] %>% tibble::enframe(name = "Model", value = "Pcnt. Coverage") %>% ggplot() + geom_point(aes(x = `Pcnt. Coverage`, y = Model)) + geom_vline(xintercept = 0.95, col = "blue") + theme(legend.position = "bottom") + ggtitle("Coverage of approx. 95% credible intervals for biodiversity") print(meanplt / (varplt + coverageplt) + plot_annotation('In Sample Biodiversity, marginal over LV distribution'))
obsdetections <- colSums(fittedmods[[1]]$data$y) Edetections_l <- lapply(fittedmods, function(x){ EnV <- Endetect_modelsite(x, conditionalLV = (!is.null(x$data$nlv) && (x$data$nlv > 0) )) Edet_sum <- colSums(EnV[["E_ndetect"]]) Vdet_sum <- colSums(EnV[["V_ndetect"]]) return(data.frame(Species = names(Edet_sum), Edet_sum = Edet_sum, Vdet_sum = Vdet_sum)) } ) Edetections <- bind_rows(Edetections_l, .id = "Model") Edetections %>% as_tibble(Edetections) %>% ggplot() + geom_pointrange(aes(ymin = Edet_sum - 2 * sqrt(Vdet_sum), y = Edet_sum, ymax = Edet_sum + 2 * sqrt(Vdet_sum), x = Species, col = Model), inherit.aes = FALSE, position = position_dodge(width = 1, preserve = "total"), fatten = 1) + geom_col(aes(y = Detections, x = Species), data = obsdetections %>% enframe(name = "Species", value = "Detections"), alpha = 0.2, col = "black", fill = "black", inherit.aes = FALSE) + coord_flip() + # geom_point(aes(x = Edet_sum, y = Species, col = Model, shape = Model), inherit.aes = FALSE) + scale_color_viridis_d() + # scale_shape_manual(values = rep(0:5, 3)) + scale_y_continuous(name = "Expected Detections") + ggtitle("Expected Number of Detections vs Observed")
pltbase <- cleanersummaries %>% filter(varname == "LV") %>% filter(varname == "LV", (Lower95 > 0) | (Upper95 < 0)) %>% inner_join(occcovar, by = "ModelSite") %>% tidyr::pivot_longer(c(names(occcovar), -ModelSite), names_to = "OccCovariate", values_to = "OccCovariateValue") %>% ggplot() pltpoints <- geom_point(aes(x = OccCovariateValue, y = Median, shape = Covariate), alpha = 0.2) pltsmooth <- geom_smooth(aes(x = OccCovariateValue, y = Median, col = Covariate), data = function(x) x %>% dplyr::filter(!(OccCovariate %in% treatfactor)), method = "gam", level = 0.95, formula = y ~ s(x, bs = "cs")) pltsmooth_low95 <- geom_smooth(aes(x = OccCovariateValue, y = Lower95, col = Covariate), data = function(x) x %>% dplyr::filter(!(OccCovariate %in% treatfactor)), method = "gam", level = 0.95, formula = y ~ s(x, bs = "cs"), lty = "dashed", lwd = 0.5) pltsmooth_high95 <- geom_smooth(aes(x = OccCovariateValue, y = Upper95, col = Covariate), data = function(x) x %>% dplyr::filter(!(OccCovariate %in% treatfactor)), method = "gam", level = 0.95, formula = y ~ s(x, bs = "cs"), lty = "dashed", lwd = 0.5) pltfactor <- ggplot2::stat_summary(aes(x = OccCovariateValue, y = Median, col = Covariate), data = function(x) x %>% dplyr::filter(OccCovariate %in% treatfactor), geom = "pointrange", fun.data = mean_se, fun.args = list(mult = 2), position = position_dodge(width = 0.1, preserve = "total"), alpha = 0.8, lwd = 1.2, fatten = 1, show.legend = FALSE) pltfactor_low95 <- ggplot2::stat_summary(aes(x = OccCovariateValue, y = Lower95, col = Covariate), data = function(x) x %>% dplyr::filter(OccCovariate %in% treatfactor), geom = "pointrange", fun.data = mean_se, fun.args = list(mult = 2), position = position_dodge(width = 0.1, preserve = "total"), alpha = 0.8, lwd = 0.2, fatten = 1, show.legend = FALSE) pltfactor_high95 <- ggplot2::stat_summary(aes(x = OccCovariateValue, y = Upper95, col = Covariate), data = function(x) x %>% dplyr::filter(OccCovariate %in% treatfactor), geom = "pointrange", fun.data = mean_se, fun.args = list(mult = 2), position = position_dodge(width = 0.1, preserve = "total"), alpha = 0.8, lwd = 0.2, fatten = 1, show.legend = FALSE) pltbase + pltpoints + pltsmooth + pltfactor + pltsmooth_low95 + pltsmooth_high95 + pltfactor_low95 + pltfactor_high95 + facet_grid(rows = vars(Model), cols = vars(OccCovariate), scale = "free_x") + geom_hline(yintercept = 0, col = "blue", lty = "dashed") + theme(strip.text.x.top = element_text(angle = 90)) + scale_color_viridis_d(name = "Latent Variable") + scale_x_continuous(name = "Covariate Value") + ggtitle("Fitted Statistically Non-Zero LV Values")
Note that there are too many data points to apply the shapiro-Wilks normality test directly, however the Anderson-Darling test can be applied. I do not know enough about this test yet to know if it can be fully trusted.
resid_occ_l <- lapply(fittedmods, ds_occupancy_residuals.fit, type = "median", seed = 321, conditionalLV = FALSE) vapply(resid_occ_l, function(x) goftest::ad.test(unlist(x[, -1]), null = "pnorm")$p.value, FUN.VALUE = 3.3) as_tibble(lapply(resid_occ_l, function(x) unlist(x[, -1]))) %>% tidyr::pivot_longer(everything(), names_to = "Model", values_to = "Residual") %>% ggplot() + geom_qq(aes(sample = Residual), na.rm = TRUE) + geom_abline(slope = 1, intercept = 0, lty = "dashed") + facet_grid(cols = vars(Model)) + ggtitle("Occupancy Residual Distribution: qq plots")
resid_det_l <- lapply(fittedmods, ds_detection_residuals.fit, type = "median", seed = 321) vapply(resid_det_l, function(x) { vals <- unlist(x[, -1]) vals <- vals[!is.na(vals)] goftest::ad.test(vals, null = "pnorm")$p.value}, FUN.VALUE = 3.3) as_tibble(lapply(resid_det_l, function(x) unlist(x[, -1]))) %>% tidyr::pivot_longer(everything(), names_to = "Model", values_to = "Residual") %>% ggplot() + geom_qq(aes(sample = Residual), na.rm = TRUE) + geom_abline(slope = 1, intercept = 0, lty = "dashed") + facet_grid(cols = vars(Model)) + ggtitle("Detection Residual Distribution: qq plots")
# bytes <- vapply(ls(), function(objname) object.size(get(objname)), FUN.VALUE = 213) # sort(bytes/1E9) rm(resid_det_l) rm(resid_occ_l)
seeds = c(321, 120) df_l <- lapply(seeds, function(x) { resid_occ_l <- lapply(fittedmods, function(fit) { resids <- ds_occupancy_residuals.fit(fit, type = "median", seed = x, conditionalLV = (!is.null(fit$nlv) && fit$nlv > 0)) return(resids)}) resid_occ_df <- bind_rows(resid_occ_l, .id = "Model") df <- resid_occ_df %>% tidyr::pivot_longer(-c(ModelSite, Model), names_to = "Species", values_to = "Residual", values_drop_na = TRUE) %>% left_join(occcovar %>% tidyr::pivot_longer(-ModelSite, names_to = "Covariate", values_to = "CovariateValue"), by = "ModelSite") return(df) }) names(df_l) <- seeds df <- bind_rows(df_l, .id = "seed") rm(df_l) pltbase <- df %>% ggplot() pltpoints <- ggplot2::geom_point(aes(x = CovariateValue, y = Residual), data = function(x) x[x$seed == seeds[[1]], ]) # gam smooths for continuous variables pltsmooth <- ggplot2::geom_smooth(aes(x = CovariateValue, y = Residual, col = seed), data = function(x) x %>% dplyr::filter(!(Covariate %in% treatfactor)), method = "gam", level = 0.95, formula = y ~ s(x, bs = "cs")) # mean + 2SE summaries for discrete variables pltfactor <- ggplot2::stat_summary(aes(x = CovariateValue, y = Residual, col = seed), data = function(x) x %>% dplyr::filter(Covariate %in% treatfactor), geom = "pointrange", fun.data = mean_se, fun.args = list(mult = 2), position = position_dodge(width = 0.1, preserve = "total"), alpha = 0.8, lwd = 1.2, fatten = 1, show.legend = FALSE) plt <- pltbase + pltsmooth + pltfactor + ggplot2::facet_grid(rows = vars(Model), cols = vars(Covariate), as.table = TRUE, scales = "free_x") + ggplot2::geom_hline(yintercept = 0, col = "blue", lty = "dashed") + ggplot2::scale_x_continuous(name = "Covariate Value") + scale_y_continuous(name = "Occupancy Residual") plt <- plt + coord_cartesian(ylim = c(-0.1, 0.1)) + ggtitle("Occupancy Residuals vs Covariates") + theme(strip.text.y.right = element_text(angle = 0)) print(plt) rm(df)
seeds = c(321, 120) df_l <- lapply(seeds, function(x) { resid_det_l <- lapply(fittedmods, function(fit) { resids <- ds_detection_residuals.fit(fit, type = "median", seed = x) return(resids)}) resid_det_df <- bind_rows(resid_det_l, .id = "Model") df <- resid_det_df %>% tidyr::pivot_longer(-c(ModelSite, Model), names_to = "Species", values_to = "Residual", values_drop_na = TRUE) %>% left_join(detcovar %>% tidyr::pivot_longer(-ModelSite, names_to = "Covariate", values_to = "CovariateValue"), by = "ModelSite") return(df) }) names(df_l) <- seeds df_det <- bind_rows(df_l, .id = "seed") rm(df_l) pltbase <- df_det %>% ggplot() plt <- pltbase + pltsmooth + pltfactor + ggplot2::facet_grid(rows = vars(Model), cols = vars(Covariate), as.table = TRUE, scales = "free_x") + ggplot2::geom_hline(yintercept = 0, col = "blue", lty = "dashed") + ggplot2::scale_x_continuous(name = "Covariate Value") + scale_y_continuous(name = "Detection Residual") plt + coord_cartesian(ylim = c(-0.1, 0.1)) + ggtitle("Detection Residuals vs Covariates") + theme(strip.text.y.right = element_text(angle = 0)) rm(df_det)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.