tests/testthat/test-expectedbiodiversity.R

# test expected number of species

context("Number of Detected Species Expected")

# tests:
# multiple different draws*****
# model has LVs, use fitted LVs, in sample data
# model has LVs, marginalise LVs, in sample data
# model has no LVs, in sample data

# model has LVs, marginalise LVs, holdout data
# model has no LVs, holdout data

test_that("In sample data; fitted LV values; different draws", {
  nsites <- 1000
  artfit <- artificial_runjags(nspecies = 60, nsites = nsites, nvisitspersite = 1, nlv = 4)
  
  # make a new, different, second parameter set
  artfit$mcmc[[2]] <- artfit$mcmc[[1]]
  artfit$sample <- 1
  bugvarnames <- names(artfit$mcmc[[1]][1, ])
  artfit$mcmc[[2]][1, grepl("^u.b\\[.*", bugvarnames)] <- artfit$mcmc[[1]][1, grepl("^u.b\\[.*", bugvarnames)] * runif(3, min = 5, max = 10)
  artfit$mcmc[[2]][1, grepl("^v.b\\[.*", bugvarnames)] <- artfit$mcmc[[1]][1, grepl("^v.b\\[.*", bugvarnames)] * runif(3, min = 5, max = 10)
  artfit$mcmc[[2]][1, grepl("^lv.coef\\[.*", bugvarnames)] <- artfit$mcmc[[1]][1, grepl("^lv.coef\\[.*", bugvarnames)] * runif(3, min = 0.1, max = 0.2)
  artfit$mcmc[[2]][1, grepl("^LV\\[.*", bugvarnames)] <- artfit$mcmc[[1]][1, grepl("^LV\\[.*", bugvarnames)] * runif(4, min = 0.5, max = 1)
  
  # Predicted number of species detected and in occupation
  numspec <- predsumspecies(artfit, UseFittedLV = TRUE)
  meanvar <- cumsum(numspec["Vsum_det_median", ])/((1:ncol(numspec))^2)
  sd_final <- sqrt(meanvar[ncol(numspec)])
  expect_equal(ncol(numspec), nsites)
  
  # Anticipate the Enumspec is wrong when using both draws, as simulated data in artfit is from the first draw
  NumSpecies_1st <- detectednumspec(y = artfit$data$y, ModelSite = artfit$data$ModelSite)
  
  meandiff_1st <- dplyr::cummean(NumSpecies_1st - numspec["Esum_det_median", ])
  plt <- cbind(diff = meandiff_1st, var  = meanvar) %>% 
    dplyr::as_tibble(rownames = "CumSites") %>% 
    dplyr::mutate(CumSites = as.double(CumSites)) %>%
    ggplot2::ggplot() +
    ggplot2::geom_ribbon(ggplot2::aes(x= CumSites, ymin = -2 * sqrt(var), ymax = 2 * sqrt(var)), fill = "grey") +
    ggplot2::geom_line(ggplot2::aes(x = CumSites, y = diff), col = "blue", lwd = 2)
  # print(plt)
  expect_gt(abs(meandiff_1st[ncol(numspec)]), 3 * sd_final)
  
  # Anticipate the Enumspec is correct when using only first draw (chain), as simulated data in artfit is from the first draw
  Enumspec_1stonly <- predsumspecies(artfit, chain = 1, UseFittedLV = TRUE)
  meanvar_1stonly <- cumsum(Enumspec_1stonly["Vsum_det_median", ])/((1:ncol(Enumspec_1stonly))^2)
  sd_final_1st <- sqrt(meanvar[ncol(Enumspec_1stonly)])
  
  meandiff_1st <- dplyr::cummean(NumSpecies_1st - Enumspec_1stonly["Esum_det_median", ])
  plt <- cbind(diff = meandiff_1st, var  = meanvar) %>% 
    dplyr::as_tibble(rownames = "CumSites") %>% 
    dplyr::mutate(CumSites = as.double(CumSites)) %>%
    ggplot2::ggplot() +
    ggplot2::geom_ribbon(ggplot2::aes(x= CumSites, ymin = -2 * sqrt(var), ymax = 2 * sqrt(var)), fill = "grey") +
    ggplot2::geom_line(ggplot2::aes(x = CumSites, y = diff), col = "blue", lwd = 2)
  # print(plt)
  
  expect_lt(abs(meandiff_1st[ncol(Enumspec_1stonly)]), 3 * sd_final_1st)
  
  # Anticipate that it is correct for 2nd draw separated from the 1st draw
  y_2nd <- simulate_fit(artfit, esttype = 2, UseFittedLV = TRUE)
  NumSpecies_2nd <- detectednumspec(y = y_2nd, ModelSite = artfit$data$ModelSite)
  Enumspec_2ndonly <- predsumspecies(artfit, chain = 2, UseFittedLV = TRUE)
  meanvar_2ndonly <- cumsum(Enumspec_2ndonly["Vsum_det_median", ])/((1:ncol(Enumspec_2ndonly))^2)
  sd_final_2nd <- sqrt(meanvar[ncol(Enumspec_2ndonly)])
  meandiff_2nd <- dplyr::cummean(NumSpecies_2nd - Enumspec_2ndonly["Esum_det_median", ])
  plt <- cbind(diff = meandiff_2nd, var  = meanvar) %>% 
    dplyr::as_tibble(rownames = "CumSites") %>% 
    dplyr::mutate(CumSites = as.double(CumSites)) %>%
    ggplot2::ggplot() +
    ggplot2::geom_ribbon(ggplot2::aes(x= CumSites, ymin = -2 * sqrt(var), ymax = 2 * sqrt(var)), fill = "grey") +
    ggplot2::geom_line(ggplot2::aes(x = CumSites, y = diff), col = "blue", lwd = 2)
  # print(plt)
  expect_lt(abs(meandiff_2nd[ncol(Enumspec_2ndonly)]), 3 * sd_final_2nd)
  
  
  # Anticipate prediction from combined draw is similar when occupancy + detection simulated with parameters chosen with equal chance from artfit$mcmc[[1]]
  # But anticipate within-model uncertainty of median parameter values does not cover observed values.
  ## combine observations for model sites to simulate the equal credence on each parameter set
  NumSpecies_interleaved <- NumSpecies_1st
  drawselect <- as.logical(rbinom(1000, size = 1, prob = 0.5))
  NumSpecies_interleaved[drawselect] <- NumSpecies_2nd[drawselect]
  
  meandiff <- dplyr::cummean(NumSpecies_interleaved - numspec["Esum_det_margpost", ])
  meanvar <- cumsum(numspec["Vsum_det_margpost", ])/((1:ncol(numspec))^2)
  plt <- cbind(diff = meandiff, var = meanvar) %>% 
    dplyr::as_tibble(rownames = "CumSites") %>% 
    dplyr::mutate(CumSites = as.double(CumSites)) %>%
    ggplot2::ggplot() +
    ggplot2::geom_ribbon(ggplot2::aes(x= CumSites, ymin = -2 * sqrt(var), ymax = 2 * sqrt(var)), fill = "grey") +
    ggplot2::geom_line(ggplot2::aes(x = CumSites, y = diff), col = "blue", lwd = 2)
  # print(plt)
  
  sd_final <- sqrt(meanvar[ncol(numspec)])
  expect_lt(abs(meandiff[ncol(numspec)]), 3 * sd_final)
  
  # Hope that Gaussian approximation of a 95% interval covers the observed data 95% of the time
  ininterval_marg <- (NumSpecies_interleaved > numspec["Esum_det_margpost", ] - 2 * sqrt(numspec["Vsum_det_margpost", ])) & 
    (NumSpecies_interleaved < numspec["Esum_det_margpost", ] + 2 * sqrt(numspec["Vsum_det_margpost", ]))
  expect_equal(mean(ininterval_marg), 0.95, tol = 0.05)
  
  # and that within-model median parameters *do not*
  ininterval_median <- (NumSpecies_interleaved > numspec["Esum_det_median", ] - 2 * sqrt(numspec["Vsum_det_median", ])) & 
    (NumSpecies_interleaved < numspec["Esum_det_median", ] + 2 * sqrt(numspec["Vsum_det_median", ]))
  expect_lt(mean(ininterval_median), 0.90)
})

test_that("In sample data; fitted LV values", {
  nsites <- 10000
  artfit <- artificial_runjags(nspecies = 60, nsites = nsites, nvisitspersite = 3, nlv = 4)
  artfit$mcmc[[1]] <- rbind(artfit$mcmc[[1]][1, ], artfit$mcmc[[1]][1, ])
  
  numspec <- predsumspecies(artfit, UseFittedLV = TRUE, type = "median")
  expect_equal(ncol(numspec), nsites)
  
  NumSpecies <- detectednumspec(y = artfit$data$y, ModelSite = artfit$data$ModelSite)
  
  # median of theta should be correct
  meandiff <- dplyr::cummean(NumSpecies - numspec["Esum_det_median", ])
  meanvar <- cumsum(numspec["Vsum_det_median", ])/((1:ncol(numspec))^2)
  plt <- cbind(diff = meandiff, var  = meanvar) %>% 
    dplyr::as_tibble(rownames = "CumSites") %>% 
    dplyr::mutate(CumSites = as.double(CumSites)) %>%
    ggplot2::ggplot() +
    ggplot2::geom_ribbon(ggplot2::aes(x= CumSites, ymin = -2 * sqrt(var), ymax = 2 * sqrt(var)), fill = "grey") +
    ggplot2::geom_line(ggplot2::aes(x = CumSites, y = diff), col = "blue", lwd = 2)
  # print(plt)
  
  # check with predicted standard error once the software is computed
  sd_final <- sqrt(meanvar[ncol(numspec)])
  expect_equal(meandiff[ncol(numspec)], 0, tol = 3 * sd_final)
  
  # difference between expected and observed should be zero on average; check that is getting closer with increasing data
  expect_lt(abs(meandiff[length(meandiff)]), max(abs(meandiff[floor(length(meandiff) / 20) + 1:20 ])))
  
  Enum_compare_sum <- Enum_compare(NumSpecies,
               as.matrix(numspec["Esum_det_median", ], ncol = 1),
               as.matrix(numspec["Vsum_det_median", ], ncol = 1)
               )
  expect_equivalent(Enum_compare_sum[["E[D]_obs"]], 0, tol = 3 * Enum_compare_sum[["SE(E[D]_obs)_model"]])
  expect_equivalent(Enum_compare_sum[["E[D]_obs"]], 0, tol = 3 * Enum_compare_sum[["SE(E[D]_obs)_obs"]])
  expect_equivalent(Enum_compare_sum[["V[D]_model"]], Enum_compare_sum[["V[D]_obs"]], tol = 0.05 * Enum_compare_sum[["V[D]_obs"]])
  
  # Hope that Gaussian approximation of a 95% interval covers the observed data 95% of the time
  ininterval <- (NumSpecies > numspec["Esum_det_margpost", ] - 2 * sqrt(numspec["Vsum_det_margpost", ])) & 
    (NumSpecies < numspec["Esum_det_margpost", ] + 2 * sqrt(numspec["Vsum_det_margpost", ]))
  expect_equal(mean(ininterval), 0.95, tol = 0.05)
  
  plt <- cbind(NumSpecies = NumSpecies, pred = numspec["Esum_det_margpost", ], se = sqrt(numspec["Vsum_det_margpost", ])) %>% 
    dplyr::as_tibble() %>% 
    dplyr::mutate(resid = NumSpecies - pred) %>%
    dplyr::arrange(resid) %>%
    tibble::rowid_to_column(var = "rowid") %>%
    ggplot2::ggplot() +
    ggplot2::geom_ribbon(ggplot2::aes(x= rowid, ymin = -2 * se, ymax = + 2 * se), fill = "grey") +
    ggplot2::geom_point(ggplot2::aes(x = rowid, y = NumSpecies - pred), col = "blue", lwd = 2)
  # print(plt)
})

test_that("In sample data; marginal on LV values", {
  nsites <- 1000
  artfit <- artificial_runjags(nspecies = 60, nsites = nsites, nvisitspersite = 3, nlv = 4,
                               u.b.min = -0.01,
                               u.b.max = 0.01,
                               v.b.min = -0.01,
                               v.b.max = 0.01,
                               lv.coef.min = 0.4,
                               lv.coef.max = 0.5 #hopefully LVs have a much bigger effect than occupancy etc
                               )
  artfit$mcmc[[1]] <- rbind(artfit$mcmc[[1]][1, ], artfit$mcmc[[1]][1, ])
  
  numspec <- predsumspecies(artfit, UseFittedLV = FALSE, nLVsim = 1000, type = "median")
  expect_equal(ncol(numspec), nsites)
  
  NumSpecies <- detectednumspec(y = artfit$data$y, ModelSite = artfit$data$ModelSite)
  
  # the median should be perfectly correct except that it ignores the fitted LV values aren't used
  meandiff <- dplyr::cummean(NumSpecies - numspec["Esum_det_median", ])
  meanvar <- cumsum(numspec["Vsum_det_median", ])/((1:ncol(numspec))^2)
  plt <- cbind(diff = meandiff, var  = meanvar) %>% 
    dplyr::as_tibble(rownames = "CumSites") %>% 
    dplyr::mutate(CumSites = as.double(CumSites)) %>%
    ggplot2::ggplot() +
    ggplot2::geom_ribbon(ggplot2::aes(x= CumSites, ymin = -2 * sqrt(var), ymax = 2 * sqrt(var)), fill = "grey") +
    ggplot2::geom_line(ggplot2::aes(x = CumSites, y = diff), col = "blue", lwd = 2)
  # print(plt)
  
  # check with predicted standard error: should be larger than bound due to misuse of LV
  sd_half <- sqrt(meanvar[floor(ncol(numspec)/2)])
  expect_gt(abs(meandiff[floor(ncol(numspec)/2)]), 3 * sd_half)
  
  # expect that cover by posterior approximate interval is about 
  # right as LV values simulate from are not extreme for a Gaussian
  # and the calculations marginalise of the LV value distribution
  ininterval <- (NumSpecies > numspec["Esum_det_margpost", ] - 2 * sqrt(numspec["Vsum_det_margpost", ])) & 
    (NumSpecies < numspec["Esum_det_margpost", ] + 2 * sqrt(numspec["Vsum_det_margpost", ]))
  expect_gt(mean(ininterval), 0.95)
  
  ######################################### Now try using marginalised LV simulations #####################################
  NumSpecies <- detectednumspec(y = simulate_fit(artfit, esttype = 1, UseFittedLV = FALSE), ModelSite = artfit$data$ModelSite)
  
  meandiff <- dplyr::cummean(NumSpecies - numspec["Esum_det_median", ])
  meanvar <- cumsum(numspec["Vsum_det_median", ])/((1:ncol(numspec))^2)
  plt <- cbind(diff = meandiff, var  = meanvar) %>% 
    dplyr::as_tibble(rownames = "CumSites") %>% 
    dplyr::mutate(CumSites = as.double(CumSites)) %>%
    ggplot2::ggplot() +
    ggplot2::geom_ribbon(ggplot2::aes(x= CumSites, ymin = -2 * sqrt(var), ymax = 2 * sqrt(var)), fill = "grey") +
    ggplot2::geom_line(ggplot2::aes(x = CumSites, y = diff), col = "blue", lwd = 2)
  # print(plt)
  
  # check with predicted standard error 
  sd_final <- sqrt(meanvar[ncol(numspec)])
  expect_equal(meandiff[ncol(numspec)], 0, tol = 3 * sd_final)
  
  # difference between expected and observed should be zero on average; check that is getting closer with increasing data
  expect_lt(abs(meandiff[length(meandiff)]), max(abs(meandiff[floor(length(meandiff) / 20) + 1:20 ])))
  
  # Hope that Gaussian approximation of a 95% interval covers the observed data 95% of the time
  ininterval <- (NumSpecies > numspec["Esum_det_margpost", ] - 2 * sqrt(numspec["Vsum_det_margpost", ])) & 
    (NumSpecies < numspec["Esum_det_margpost", ] + 2 * sqrt(numspec["Vsum_det_margpost", ]))
  expect_gt(mean(ininterval), 0.99)
  # I suspect this is not 0.95 because the Gaussian approximation may not work: the variance is enormous compared to the allowed range of number of species
  
  plt <- cbind(NumSpecies = NumSpecies, pred = numspec["Esum_det_margpost", ], se = sqrt(numspec["Vsum_det_margpost", ])) %>% 
    dplyr::as_tibble() %>% 
    dplyr::mutate(resid = NumSpecies - pred) %>%
    dplyr::arrange(resid) %>%
    tibble::rowid_to_column(var = "rowid") %>%
    ggplot2::ggplot() +
    ggplot2::geom_ribbon(ggplot2::aes(x= rowid, ymin = -2 * se, ymax = + 2 * se), fill = "grey") +
    ggplot2::geom_line(ggplot2::aes(x = rowid, y = NumSpecies - pred), col = "blue", lwd = 2)
  # print(plt)
})

test_that("In sample data; no LV", {
  nsites <- 10000
  artfit <- artificial_runjags(nspecies = 60, nsites = nsites, nvisitspersite = 3, nlv = 0)
  artfit$mcmc[[1]] <- rbind(artfit$mcmc[[1]][1, ], artfit$mcmc[[1]][1, ])
  
  Enumspecdet <- predsumspecies(artfit, UseFittedLV = FALSE, type = "marginal")
  expect_equal(ncol(Enumspecdet), nsites)
  
  NumSpecies <- detectednumspec(y = artfit$data$y, ModelSite = artfit$data$ModelSite)
  
  meandiff <- dplyr::cummean(NumSpecies - Enumspecdet["Esum_det", ])
  meanvar <- cumsum(Enumspecdet["Vsum_det", ])/((1:ncol(Enumspecdet))^2)
  plt <- cbind(diff = meandiff, var  = meanvar) %>% 
    dplyr::as_tibble(rownames = "CumSites") %>% 
    dplyr::mutate(CumSites = as.double(CumSites)) %>%
    ggplot2::ggplot() +
    ggplot2::geom_ribbon(ggplot2::aes(x= CumSites, ymin = -2 * sqrt(var), ymax = 2 * sqrt(var)), fill = "grey") +
    ggplot2::geom_line(ggplot2::aes(x = CumSites, y = diff), col = "blue", lwd = 2)
  # print(plt)
  
  # check with predicted standard error once the software is computed
  sd_final <- sqrt(meanvar[ncol(Enumspecdet)])
  expect_equal(meandiff[ncol(Enumspecdet)], 0, tol = 3 * sd_final)
  
  # difference between expected and observed should be zero on average; check that is getting closer with increasing data
  expect_lt(abs(meandiff[length(meandiff)]), max(abs(meandiff[floor(length(meandiff) / 20) + 1:20 ])))
  
  # Hope that Gaussian approximation of a 95% interval covers the observed data 95% of the time
  ininterval <- (NumSpecies > Enumspecdet["Esum_det", ] - 2 * sqrt(Enumspecdet["Vsum_det", ])) & 
    (NumSpecies < Enumspecdet["Esum_det", ] + 2 * sqrt(Enumspecdet["Vsum_det", ]))
  expect_equal(mean(ininterval), 0.95, tol = 0.05)
})


test_that("Holdout data; has LVs", {
  nsites <- 10000
  artfit <- artificial_runjags(nspecies = 60, nsites = nsites, nvisitspersite = 3, nlv = 4) #means LV nearly don't matter
  artfit$mcmc[[1]] <- rbind(artfit$mcmc[[1]][1, ], artfit$mcmc[[1]][1, ])
  
  originalXocc <- unstandardise.designmatprocess(artfit$XoccProcess, artfit$data$Xocc)
  originalXocc <- cbind(ModelSite = 1:nrow(originalXocc), originalXocc)
  originalXobs <- unstandardise.designmatprocess(artfit$XobsProcess, artfit$data$Xobs)
  originalXobs <- cbind(ModelSite = artfit$data$ModelSite, originalXobs)
  outofsample_y <- simulate_fit(artfit, esttype = 1, UseFittedLV = FALSE)
  
  Enumspec <- predsumspecies_newdata(artfit, originalXocc, originalXobs, ModelSiteVars = "ModelSite", chains = NULL, nLVsim = 1000, type = "median", cl = NULL)
  
  expect_equal(ncol(Enumspec), nsites)
  
  NumSpecies <- detectednumspec(y = outofsample_y, ModelSite = originalXobs[, "ModelSite"])
  
  meandiff <- dplyr::cummean(NumSpecies - Enumspec["Esum_det_median", ])
  meanvar <- cumsum(Enumspec["Vsum_det_median", ])/((1:ncol(Enumspec))^2)
  plt <- cbind(diff = meandiff, var  = meanvar) %>% 
    dplyr::as_tibble(rownames = "CumSites") %>% 
    dplyr::mutate(CumSites = as.double(CumSites)) %>%
    ggplot2::ggplot() +
    ggplot2::geom_ribbon(ggplot2::aes(x= CumSites, ymin = -2 * sqrt(var), ymax = 2 * sqrt(var)), fill = "grey") +
    ggplot2::geom_line(ggplot2::aes(x = CumSites, y = diff), col = "blue", lwd = 2)
  # print(plt)
  
  # check with predicted standard error once the software is computed
  sd_final <- sqrt(meanvar[ncol(Enumspec)])
  expect_equal(meandiff[ncol(Enumspec)], 0, tol = 3 * sd_final)
  
  # difference between expected and observed should be zero on average; check that is getting closer with increasing data
  expect_lt(abs(meandiff[length(meandiff)]), max(abs(meandiff[floor(length(meandiff) / 100) + 1:20 ])))
  
  Enum_compare_sum <- Enum_compare(NumSpecies,
                                   as.matrix(Enumspec["Esum_det_median", ], ncol = 1),
                                   as.matrix(Enumspec["Vsum_det_median", ], ncol = 1)
  )
  expect_equivalent(Enum_compare_sum[["E[D]_obs"]], 0, tol = 3 * Enum_compare_sum[["SE(E[D]_obs)_model"]])
  expect_equivalent(Enum_compare_sum[["E[D]_obs"]], 0, tol = 3 * Enum_compare_sum[["SE(E[D]_obs)_obs"]])
  expect_equivalent(Enum_compare_sum[["V[D]_model"]], Enum_compare_sum[["V[D]_obs"]], tol = 0.05 * Enum_compare_sum[["V[D]_obs"]])
  
  # Hope that Gaussian approximation of a 95% interval covers the observed data 95% of the time
  ininterval <- (NumSpecies > Enumspec["Esum_det_margpost", ] - 2 * sqrt(Enumspec["Vsum_det_margpost", ])) & 
    (NumSpecies < Enumspec["Esum_det_margpost", ] + 2 * sqrt(Enumspec["Vsum_det_margpost", ]))
  expect_equal(mean(ininterval), 0.95, tol = 0.05)
})



test_that("Holdout data; no LVs", {
  nsites <- 1000
  artfit <- artificial_runjags(nspecies = 60, nsites = nsites, nvisitspersite = 3, nlv = 0) #means LV nearly don't matter
  artfit$mcmc[[1]] <- rbind(artfit$mcmc[[1]][1, ], artfit$mcmc[[1]][1, ])
  
  originalXocc <- unstandardise.designmatprocess(artfit$XoccProcess, artfit$data$Xocc)
  originalXocc <- cbind(ModelSite = 1:nrow(originalXocc), originalXocc)
  originalXobs <- unstandardise.designmatprocess(artfit$XobsProcess, artfit$data$Xobs)
  originalXobs <- cbind(ModelSite = artfit$data$ModelSite, originalXobs)
  outofsample_y <- simulate_fit(artfit, esttype = 1, UseFittedLV = FALSE)
  
  Enumspec <- predsumspecies_newdata(artfit, originalXocc, originalXobs, ModelSiteVars = "ModelSite", chains = NULL, nLVsim = 1000, type = "marginal", cl = NULL)
  
  expect_equal(ncol(Enumspec), nsites)
  
  NumSpecies <- detectednumspec(y = outofsample_y, ModelSite = originalXobs[, "ModelSite"])
  
  meandiff <- dplyr::cummean(NumSpecies - Enumspec["Esum_det", ])
  meanvar <- cumsum(Enumspec["Vsum_det", ])/((1:ncol(Enumspec))^2)
  plt <- cbind(diff = meandiff, var  = meanvar) %>% 
    dplyr::as_tibble(rownames = "CumSites") %>% 
    dplyr::mutate(CumSites = as.double(CumSites)) %>%
    ggplot2::ggplot() +
    ggplot2::geom_ribbon(ggplot2::aes(x= CumSites, ymin = -2 * sqrt(var), ymax = 2 * sqrt(var)), fill = "grey") +
    ggplot2::geom_line(ggplot2::aes(x = CumSites, y = diff), col = "blue", lwd = 2)
  # print(plt)
  
  # check with predicted standard error once the software is computed
  sd_final <- sqrt(meanvar[ncol(Enumspec)])
  expect_equal(meandiff[ncol(Enumspec)], 0, tol = 3 * sd_final)
  
  # difference between expected and observed should be zero on average; check that is getting closer with increasing data
  expect_lt(abs(meandiff[length(meandiff)]), max(abs(meandiff[floor(length(meandiff) / 100) + 1:20 ])))
  
  Enum_compare_sum <- Enum_compare(NumSpecies,
                                   as.matrix(Enumspec["Esum_det", ], ncol = 1),
                                   as.matrix(Enumspec["Vsum_det", ], ncol = 1)
  )
  expect_equivalent(Enum_compare_sum[["E[D]_obs"]], 0, tol = 3 * Enum_compare_sum[["SE(E[D]_obs)_model"]])
  expect_equivalent(Enum_compare_sum[["E[D]_obs"]], 0, tol = 3 * Enum_compare_sum[["SE(E[D]_obs)_obs"]])
  expect_equivalent(Enum_compare_sum[["V[D]_model"]], Enum_compare_sum[["V[D]_obs"]], tol = 0.05 * Enum_compare_sum[["V[D]_obs"]])
  
  # Hope that Gaussian approximation of a 95% interval covers the observed data 95% of the time
  Enumspecdet <- predsumspecies_newdata(artfit, originalXocc, originalXobs, ModelSiteVars = "ModelSite", chains = NULL, nLVsim = 1000, type = "marginal")
  ininterval <- (NumSpecies > Enumspecdet["Esum_det", ] - 2 * sqrt(Enumspecdet["Vsum_det", ])) & 
    (NumSpecies < Enumspecdet["Esum_det", ] + 2 * sqrt(Enumspecdet["Vsum_det", ]))
  expect_equal(mean(ininterval), 0.95, tol = 0.05)
})

test_that("Subset biodiversity matches simulations", {
  # make it full test by having: different draws and latent variables, and testing both marginal and fitted latent variables
  nsites <- 1000
  artfit <- artificial_runjags(nspecies = 60, nsites = nsites, nvisitspersite = 1, nlv = 4)
  
  # make a new, different, second parameter set
  artfit$mcmc[[2]] <- artfit$mcmc[[1]]
  artfit$sample <- 1
  bugvarnames <- names(artfit$mcmc[[1]][1, ])
  artfit$mcmc[[2]][1, grepl("^u.b\\[.*", bugvarnames)] <- artfit$mcmc[[1]][1, grepl("^u.b\\[.*", bugvarnames)] * runif(3, min = 5, max = 10)
  artfit$mcmc[[2]][1, grepl("^v.b\\[.*", bugvarnames)] <- artfit$mcmc[[1]][1, grepl("^v.b\\[.*", bugvarnames)] * runif(3, min = 5, max = 10)
  artfit$mcmc[[2]][1, grepl("^lv.coef\\[.*", bugvarnames)] <- artfit$mcmc[[1]][1, grepl("^lv.coef\\[.*", bugvarnames)] * runif(3, min = 0.1, max = 0.2)
  artfit$mcmc[[2]][1, grepl("^LV\\[.*", bugvarnames)] <- artfit$mcmc[[1]][1, grepl("^LV\\[.*", bugvarnames)] * runif(4, min = 0.5, max = 1)
   
  # Simulate equally from each draw
  y_1st <- simulate_fit(artfit, esttype = 1, UseFittedLV = TRUE)
  y_2nd <- simulate_fit(artfit, esttype = 2, UseFittedLV = TRUE)
  y_interleaved <- y_1st
  drawselect <- as.logical(rbinom(1000, size = 1, prob = 0.5))
  y_interleaved[drawselect, ] <- y_2nd[drawselect, ]
  
  # Choose a subset of species
  speciessubset <- sample(artfit$species, size = 20)
  NumSpeciesObs <- detectednumspec(y_interleaved[, speciessubset], ModelSite = artfit$data$ModelSite)
  
  # Predict number within subset, in sample, using LV
  numspec_insample_fitLV <- predsumspecies(artfit, desiredspecies = speciessubset, UseFittedLV = TRUE, type = "marginal")
  inci_insample_fitLV <- (NumSpeciesObs > numspec_insample_fitLV["Esum_det", ] - 2 * sqrt(numspec_insample_fitLV["Vsum_det", ])) & 
    (NumSpeciesObs < numspec_insample_fitLV["Esum_det", ] + 2 * sqrt(numspec_insample_fitLV["Vsum_det", ]))
  expect_equal(mean(inci_insample_fitLV), 0.95, tol = 0.05)
  
  # Predict number within subset, in sample, marginal LV
  numspec_insample_margLV <- predsumspecies(artfit, desiredspecies = speciessubset, UseFittedLV = FALSE, type = "marginal")
  inci_insample_margLV <- (NumSpeciesObs > numspec_insample_margLV["Esum_det", ] - 2 * sqrt(numspec_insample_margLV["Vsum_det", ])) & 
    (NumSpeciesObs < numspec_insample_margLV["Esum_det", ] + 2 * sqrt(numspec_insample_margLV["Vsum_det", ]))
  expect_equal(mean(inci_insample_margLV), 0.95, tol = 0.05)
  
  # Predict number within subset, outside sample, marginal LV
  originalXocc <- unstandardise.designmatprocess(artfit$XoccProcess, artfit$data$Xocc)
  originalXocc <- cbind(ModelSite = 1:nrow(originalXocc), originalXocc)
  originalXobs <- unstandardise.designmatprocess(artfit$XobsProcess, artfit$data$Xobs)
  originalXobs <- cbind(ModelSite = artfit$data$ModelSite, originalXobs)
  outofsample_y <- simulate_fit(artfit, esttype = 1, UseFittedLV = FALSE)
  
  numspec_holdout_margLV <- predsumspecies_newdata(artfit, originalXocc, originalXobs, ModelSiteVars = "ModelSite",
                                     desiredspecies = speciessubset,
                                     chains = NULL, nLVsim = 1000, type = "marginal", cl = NULL)
  
  inci_holdout_margLV <- (NumSpeciesObs > numspec_holdout_margLV["Esum_det", ] - 2 * sqrt(numspec_holdout_margLV["Vsum_det", ])) & 
    (NumSpeciesObs < numspec_holdout_margLV["Esum_det", ] + 2 * sqrt(numspec_holdout_margLV["Vsum_det", ]))
  expect_equal(mean(inci_holdout_margLV), 0.95, tol = 0.05)
})

test_that("Subset biodiversity to single species matches simulations", {
  # make it full test by having: different draws and latent variables, and testing both marginal and fitted latent variables
  nsites <- 1000
  artfit <- artificial_runjags(nspecies = 60, nsites = nsites, nvisitspersite = 1, nlv = 4)
  
  # make a new, different, second parameter set
  artfit$mcmc[[2]] <- artfit$mcmc[[1]]
  artfit$sample <- 1
  bugvarnames <- names(artfit$mcmc[[1]][1, ])
  artfit$mcmc[[2]][1, grepl("^u.b\\[.*", bugvarnames)] <- artfit$mcmc[[1]][1, grepl("^u.b\\[.*", bugvarnames)] * runif(3, min = 5, max = 10)
  artfit$mcmc[[2]][1, grepl("^v.b\\[.*", bugvarnames)] <- artfit$mcmc[[1]][1, grepl("^v.b\\[.*", bugvarnames)] * runif(3, min = 5, max = 10)
  artfit$mcmc[[2]][1, grepl("^lv.coef\\[.*", bugvarnames)] <- artfit$mcmc[[1]][1, grepl("^lv.coef\\[.*", bugvarnames)] * runif(3, min = 0.1, max = 0.2)
  artfit$mcmc[[2]][1, grepl("^LV\\[.*", bugvarnames)] <- artfit$mcmc[[1]][1, grepl("^LV\\[.*", bugvarnames)] * runif(4, min = 0.5, max = 1)
  
  # Simulate equally from each draw
  y_1st <- simulate_fit(artfit, esttype = 1, UseFittedLV = TRUE)
  y_2nd <- simulate_fit(artfit, esttype = 2, UseFittedLV = TRUE)
  y_interleaved <- y_1st
  drawselect <- as.logical(rbinom(1000, size = 1, prob = 0.5))
  y_interleaved[drawselect, ] <- y_2nd[drawselect, ]
  
  # Choose a subset of species
  speciessubset <- sample(artfit$species, size = 1)
  NumSpeciesObs <- detectednumspec(y_interleaved[, speciessubset, drop = FALSE], ModelSite = artfit$data$ModelSite)
  
  # Predict number within subset, in sample, using LV
  numspec_insample_fitLV <- predsumspecies(artfit, desiredspecies = speciessubset, UseFittedLV = TRUE, type = "marginal")
  Enum_compare_sum <- Enum_compare(NumSpeciesObs,
                                   as.matrix(numspec_insample_fitLV["Esum_det", ], ncol = 1),
                                   as.matrix(numspec_insample_fitLV["Vsum_det", ], ncol = 1)
  )
  expect_equivalent(Enum_compare_sum[["E[D]_obs"]], 0, tol = 3 * Enum_compare_sum[["SE(E[D]_obs)_model"]])
  expect_equivalent(Enum_compare_sum[["E[D]_obs"]], 0, tol = 3 * Enum_compare_sum[["SE(E[D]_obs)_obs"]])
  expect_equivalent(Enum_compare_sum[["V[D]_model"]], Enum_compare_sum[["V[D]_obs"]], tol = 0.05 * Enum_compare_sum[["V[D]_obs"]])

  # Predict number within subset, in sample, marginal LV
  numspec_insample_margLV <- predsumspecies(artfit, desiredspecies = speciessubset, UseFittedLV = FALSE, type = "marginal")
  Enum_compare_sum <- Enum_compare(NumSpeciesObs,
                                   as.matrix(numspec_insample_margLV["Esum_det", ], ncol = 1),
                                   as.matrix(numspec_insample_margLV["Vsum_det", ], ncol = 1)
  )
  expect_equivalent(Enum_compare_sum[["E[D]_obs"]], 0, tol = 3 * Enum_compare_sum[["SE(E[D]_obs)_model"]])
  expect_equivalent(Enum_compare_sum[["E[D]_obs"]], 0, tol = 3 * Enum_compare_sum[["SE(E[D]_obs)_obs"]])
  # the follow test doesn't pass, which is consistent with the fitted LV values not being distributed according to a Gaussian distribution
  # expect_equivalent(Enum_compare_sum[["V[D]_model"]], Enum_compare_sum[["V[D]_obs"]], tol = 0.05 * Enum_compare_sum[["V[D]_obs"]])
  
  # Predict number within subset, outside sample, marginal LV
  originalXocc <- unstandardise.designmatprocess(artfit$XoccProcess, artfit$data$Xocc)
  originalXocc <- cbind(ModelSite = 1:nrow(originalXocc), originalXocc)
  originalXobs <- unstandardise.designmatprocess(artfit$XobsProcess, artfit$data$Xobs)
  originalXobs <- cbind(ModelSite = artfit$data$ModelSite, originalXobs)
  outofsample_y <- simulate_fit(artfit, esttype = 1, UseFittedLV = FALSE)
  
  numspec_holdout_margLV <- predsumspecies_newdata(artfit, originalXocc, originalXobs, ModelSiteVars = "ModelSite",
                                                   desiredspecies = speciessubset,
                                                   chains = NULL, nLVsim = 1000, type = "marginal", cl = NULL)
  Enum_compare_sum <- Enum_compare(NumSpeciesObs,
                                   as.matrix(numspec_holdout_margLV["Esum_det", ], ncol = 1),
                                   as.matrix(numspec_holdout_margLV["Vsum_det", ], ncol = 1)
  )
  expect_equivalent(Enum_compare_sum[["E[D]_obs"]], 0, tol = 3 * Enum_compare_sum[["SE(E[D]_obs)_model"]])
  expect_equivalent(Enum_compare_sum[["E[D]_obs"]], 0, tol = 3 * Enum_compare_sum[["SE(E[D]_obs)_obs"]])
  # the follow test doesn't pass, which is consistent with the fitted LV values not being distributed according to a Gaussian distribution
  # expect_equivalent(Enum_compare_sum[["V[D]_model"]], Enum_compare_sum[["V[D]_obs"]], tol = 0.05 * Enum_compare_sum[["V[D]_obs"]])
})

test_that("Endetect_modelsite matches predsumspecies", {
  # make it full test by having: different draws and latent variables, and testing both marginal and fitted latent variables
  nsites <- 1000
  artfit <- artificial_runjags(nspecies = 60, nsites = nsites, nvisitspersite = 1, nlv = 4)
  
  # using fitted LV
  Edet1 <- Endetect_modelsite(artfit, type = "median", conditionalLV = TRUE)
  Edet2 <- lapply(artfit$species, function(sp) {
                  Edet <- predsumspecies(artfit, desiredspecies = sp, UseFittedLV = TRUE, type = "marginal")
                  #marginal works here because artfit has only one draw
                  return(Edet)}
                  )
  Edet2_t <- t(do.call(rbind, lapply(Edet2, function(x) x["Esum_det", , drop = FALSE])))
  expect_equivalent(Edet1[[1]], Edet2_t)
  
  Edet2_V_t <- t(do.call(rbind, lapply(Edet2, function(x) x["Vsum_det", , drop = FALSE])))
  expect_equivalent(Edet1[[2]], Edet2_V_t)
  
  # marginal to LV
  Edet1 <- Endetect_modelsite(artfit, type = "median", conditionalLV = FALSE)
  Edet2 <- lapply(artfit$species, function(sp) {
    Edet <- predsumspecies(artfit, desiredspecies = sp, UseFittedLV = FALSE, nLVsim = 5000, type = "marginal")
    #marginal works here because artfit has only one draw
    return(Edet)}
  )
  Edet2_t <- t(do.call(rbind, lapply(Edet2, function(x) x["Esum_det", , drop = FALSE])))
  expect_equivalent(Edet1[[1]], Edet2_t, tol = 1E-2)
  
  Edet2_V_t <- t(do.call(rbind, lapply(Edet2, function(x) x["Vsum_det", , drop = FALSE])))
  expect_equivalent(Edet1[[2]], Edet2_V_t, tol = 1E-2)
})

#########################################################################################

test_that("No LV and identical sites", {
  artfit <- artificial_runjags(nspecies = 4, nsites = 1000, nvisitspersite = 2, nlv = 0,
                               ObsFmla = "~ 1",
                               OccFmla = "~ 1")
  EVsum <- predsumspecies(artfit, UseFittedLV = FALSE, type = "marginal")
  
  # check that many other sites have the same expected number of species
  expect_equivalent(EVsum["Esum_det", ], rep(EVsum["Esum_det", 1], ncol(EVsum)))
  
  # treat each model site as a repeat simulation of a ModelSite (cos all the parameters are nearly identical)
  NumSpecies <- detectednumspec(y = artfit$data$y, ModelSite = artfit$data$ModelSite)
  
  meandiff <- dplyr::cummean(NumSpecies - EVsum["Esum_det", ])
  meanvar <- cumsum(EVsum["Vsum_det", ])/((1:ncol(EVsum))^2)
  plt <- cbind(diff = meandiff, var  = meanvar) %>% 
    dplyr::as_tibble(rownames = "CumSites") %>% 
    dplyr::mutate(CumSites = as.double(CumSites)) %>%
    ggplot2::ggplot() +
    ggplot2::geom_ribbon(ggplot2::aes(x= CumSites, ymin = -2 * sqrt(var), ymax = 2 * sqrt(var)), fill = "grey") +
    ggplot2::geom_line(ggplot2::aes(x = CumSites, y = diff), col = "blue", lwd = 2)
  # print(plt)
  
  # check with predicted standard error once the software is computed
  sd_final <- sqrt(meanvar[ncol(EVsum)])
  expect_equal(meandiff[ncol(EVsum)], 0, tol = 3 * sd_final)
  
  # difference between expected and observed should be zero on average; check that is getting closer with increasing data
  expect_lt(abs(meandiff[length(meandiff)]), max(abs(meandiff[floor(length(meandiff) / 20) + 1:20 ])))

  # Expect sd to be close to theoretical sd. Hopefully within 10%
  expect_equivalent(sd(NumSpecies), sqrt(EVsum["Vsum_det", 1]), tol = 0.1 * sqrt(EVsum["Vsum_det", 1]))
  
  # Hope that Gaussian approximation of a 95% interval covers the observed data 95% of the time
  Enumspecdet <- predsumspecies(artfit, type = "marginal", UseFittedLV = FALSE)
  ininterval <- (NumSpecies > Enumspecdet["Esum_det", ] - 2 * sqrt(Enumspecdet["Vsum_det", ])) & 
    (NumSpecies < Enumspecdet["Esum_det", ] + 2 * sqrt(Enumspecdet["Vsum_det", ]))
  expect_equal(mean(ininterval), 0.95, tol = 0.05)
})

test_that("Expected occupied number for in sample data; fitted LV values", {
  nsites <- 1000
  artfit <- artificial_runjags(nspecies = 60, nsites = nsites, nvisitspersite = 3, nlv = 4,
                               v.b.min = 20, v.b.max = 20.1) #detection is still not certain
  artfit$mcmc[[1]] <- rbind(artfit$mcmc[[1]][1, ], artfit$mcmc[[1]][1, ])
  
  Enumspecdet <- predsumspecies(artfit, UseFittedLV = TRUE, type = "marginal")
  expect_equal(ncol(Enumspecdet), nsites)
  
  NumSpecies <- detectednumspec(y = artfit$data$y, ModelSite = artfit$data$ModelSite)
  
  meandiff <- dplyr::cummean(NumSpecies - Enumspecdet["Esum_det", ])
  meanvar <- cumsum(Enumspecdet["Vsum_det", ])/((1:ncol(Enumspecdet))^2)
  plt <- cbind(diff = meandiff, var  = meanvar) %>% 
    dplyr::as_tibble(rownames = "CumSites") %>% 
    dplyr::mutate(CumSites = as.double(CumSites)) %>%
    ggplot2::ggplot() +
    ggplot2::geom_ribbon(ggplot2::aes(x= CumSites, ymin = -2 * sqrt(var), ymax = 2 * sqrt(var)), fill = "grey") +
    ggplot2::geom_line(ggplot2::aes(x = CumSites, y = diff), col = "blue", lwd = 2)
  # print(plt)
  
  # check with predicted standard error once the software is computed
  sd_final <- sqrt(meanvar[ncol(Enumspecdet)])
  expect_equal(meandiff[ncol(Enumspecdet)], 0, tol = 3 * sd_final)
})
sustainablefarms/linking-data documentation built on Oct. 28, 2020, 2:41 a.m.