R/example_zero_truncation.R

# Illustration of the zero-truncation from the poilog package

# Note: it is not efficient to use the zero-adjusted algorithm on a single
# univariate or bivariate distribution, it is much better to use the poilog
# package in these cases. The benefit of the zero-adjusted algorithm is for
# multivariate cases.

# Univariate case ####

# Get the same result, i.e. figure, every time.
set.seed(1)

# Generate the (true) log abundances
x <- rnorm(100, 1, 2)

# Generate the abundances
n <- rpois(100, lambda = exp(x))

# Make a table for the data
data_tbl <- tibble(x = x,
                   n = n,
                   z = n > 0)

# Fit the non-truncated Poisson lognormal distribution using 'poilog'
# (dotted line in Figure 1A)
m.fit <- poilogMLE(n = n[n>0], zTrunc = FALSE)

# Fit the zero-truncated Poisson lognormal distribution using 'poilog'
# (dashed line in Figure 1A)
mzt.fit <- poilogMLE(n = n[n>0], zTrunc = TRUE)

# Add species variable for analysing in glmmTMB
data_tbl <- data_tbl %>%
  add_column(species = 1:100)

# tmbzt.fit = "(glmm) template model builder zero truncated fit"
tmbzt.fit <- glmmTMB(n ~ (1|species),
                     family = truncated_poisson(link = "log"),
                     data = filter(data_tbl, z))

# k_boot: the number of species simulated, the higher the number, the
# more accurate the zero adjustment will be
k_boot <- 200
# res_cor: a vector to store the last iteration step for each replicate
# run of the zero-adjustment
res_cor <- c()
# m: we run the zero-adjusted algorithm 20 times (as mentioned above, this is
# not an efficient approach in one dimension)
for (m in 1:20){
  # res: here we store the estimates at each iteration step
  res <- c()
  # res[1]: the first value is the original estimate from glmmTMB
  res <- rbind(res, tmbzt.fit$fit$par)
  # i: we do 10 iteration steps, usually its usefull to run a few replicates
  # with higher number of iterations and determine how long they have to be
  for (i in 1:10){
    # simulate new abundances from a non-truncated poisson lognormal
    # distribution given the current adjusted values from the zero-truncated
    # GLMM estimates. keep0 = FALSE as we don't need those in the estimation
    n_boot <- rpoilog(k_boot, res[i, 1], exp(res[i, 2]), keep0 = FALSE)
    # Fit a new zero-truncated GLMM to the simulated data
    tmbzt.boot.fit <- glmmTMB(n ~ (1|species),
                              family = truncated_poisson(link = "log"),
                              data = tibble(n = n_boot,
                                            species = 1:length(n_boot)))
    # adjust the zero-truncated estimates
    tmp_res <- res[i, ] + res[1, ] - tmbzt.boot.fit$fit$par
    # add the result to the current replicate
    res <- rbind(res, tmp_res)
  }
  # store the final value of the replicate
  res_cor <- rbind(res_cor, res[11, ])
}
# for the collection of (20) replicates, we compute the mean as the zero-
# adjusted estimate
tmbzt.adj.fit <- apply(res_cor, 2, mean)

# Compare estimates, note the similarity between the 2nd and 4th estimates
m.fit$par # non-truncated 'poilog'
mzt.fit$par # zero-truncated 'poilog'
c(tmbzt.fit$fit$par[1], exp(tmbzt.fit$fit$par[2])) # zero-truncated glmm
c(tmbzt.adj.fit[1], exp(tmbzt.adj.fit[2])) # zero-adjusted glmm

# Make a figure of the data, marking observed (n > 0) abundances as grey
# and add all four estimated curves. In the background there is a shade of
# the theoretical (true) distribution
fig1a <- data_tbl %>%
  ggplot(aes(x = x)) +
  # theoretical distribution
  geom_area(data = tibble(x = seq(-5, 8, length.out = 100),
                          y = dnorm(x, 1, 2) * 100),
            aes(x = x, y = y), alpha = 0.15) +
  # abundances
  geom_histogram(aes(x = x, fill = as.factor(z)),
                 bins = 10, colour = "black", alpha = 0.8) +
  # non-truncated (poisson) lognormal distribution
  stat_function(fun = function(x, mean, sd){100*dnorm(x, mean, sd)},
                args = list(mean = m.fit$par[1],
                            sd = m.fit$par[2]),
                linetype = 3, size = 1) +
  # zero-truncated (poisson) lognormal distribution
  stat_function(fun = function(x, mean, sd){100*dnorm(x, mean, sd)},
                args = list(mean = mzt.fit$par[1],
                            sd = mzt.fit$par[2]),
                linetype = 2, size = 1) +
  # zero-truncated (glmm) lognormal distribution
  stat_function(fun = function(x, mean, sd){100*dnorm(x, mean, sd)},
                args = list(mean = tmbzt.fit$fit$par[1],
                            sd = exp(tmbzt.fit$fit$par[2])),
                linetype = 1, size = 1) +
  # zero-adjusted (glmm) lognormal distribution
  stat_function(fun = function(x, mean, sd){100*dnorm(x, mean, sd)},
                args = list(mean = tmbzt.adj.fit[1],
                            sd = exp(tmbzt.adj.fit[2])),
                linetype = 1, size = 1) +
  # aesthetics
  theme_bw() +
  scale_fill_manual(values = c("white", "grey")) +
  labs(x = "Log abundance", y = "Observerd frequency") +
  coord_cartesian(xlim = c(-4, 7)) +
  guides(fill = "none")

# Bivariate case ####

# Get the same result, i.e. figure, every time.
set.seed(1)

# The total number of species
n_sp <- 100

# Generate the (true) log abundances
x2 <- mvrnorm(n = n_sp,
              mu = c(1, 1),
              Sigma = matrix(data = c(2, 1.5,
                                      1.5, 2), nrow = 2))

# Generate the abundances
n2 <- matrix(data = rpois(n_sp * 2, lambda = exp(x2)), ncol = 2)

# Make a table for the data
data2_tbl <- tibble(x = c(x2),
                    n = c(n2),
                    z = n > 0,
                    species = rep(1:n_sp, 2),
                    set = rep(1:2, each = n_sp))

# Fit the bivariate non-truncated Poisson lognormal distribution using 'poilog'
# (dotted ellipse in Figure 1B)
m2.fit <- bipoilogMLE(n1 = n2[-which(n2[,1]==0 & n2[, 2]==0), ],
                      zTrunc = FALSE)

# Fit the bivariate zero-truncated Poisson lognormal distribution using
# 'poilog' (dashed ellipse in Figure 1B)
m2zt.fit <- bipoilogMLE(n1 = n2[-which(n2[,1]==0 & n2[, 2]==0), ],
                        zTrunc = TRUE)

# tmb2zt.fit = "(glmm) template model builder bi(2) variate zero truncated fit"
tmb2zt.fit <- glmmTMB(n ~ (1|species:set) +
                        (1|species),
                      family = truncated_poisson(link = "log"),
                      data = filter(data2_tbl, z))

# k_boot: the number of species simulated, the higher the number, the
# more accurate the zero adjustment will be
k_boot <- 200
# res_cor2: a vector to store the last iteration step for each replicate
# run of the zero-adjustment
res_cor2 <- c()
# m: we run the zero-adjusted algorithm 20 times (as mentioned above, this is
# not an efficient approach in two dimensions)
for (m in 1:20){
  # res2: here we store the estimates at each iteration step
  res2 <- c()
  # res2[1]: the first value is the original estimate from glmmTMB
  res2 <- rbind(res2, tmb2zt.fit$fit$par)
  # i: we do 10 iteration steps, usually its usefull to run a few replicates
  # with higher number of iterations and determine how long they have to be
  for (i in 1:10){
    # simulate new abundances from a non-truncated bivariate Poisson lognormal
    # distribution given the current adjusted values from the zero-truncated
    # GLMM estimates. First the (underlying) log abundances
    x2_boot <- mvrnorm(n = k_boot,
                       mu = rep(res2[i, 1], 2),
                       Sigma = matrix(
                         data = exp(res2[i, 3])^2 +
                           c(exp(res2[i, 2])^2, 0,
                             0, exp(res2[i, 2])^2),
                         nrow = 2))
    # Generate the abundances
    n2_boot <- matrix(data = rpois(k_boot*2, lambda = exp(x2_boot)), ncol = 2)
    # Make a table
    data2boot_tbl <- tibble(x = c(x2_boot),
                            n = c(n2_boot),
                            z = n > 0,
                            species = rep(1:k_boot, 2),
                            set = rep(1:2, each = k_boot))
    # Fit a new zero-truncated GLMM to the simulated data
    tmb2zt.boot.fit <- glmmTMB(n ~ (1|species:set) +
                                 (1|species),
                               family = truncated_poisson(link = "log"),
                               data = filter(data2boot_tbl, z))
    # adjust the zero-truncated estimates
    tmp2_res <- res2[i, ] + res2[1, ] - tmb2zt.boot.fit$fit$par
    # add the result to the current replicate
    res2 <- rbind(res2, tmp2_res)
  }
  # store the final value of the replicate
  res_cor2 <- rbind(res_cor2, res2[11, ])
}
# for the collection of (20) replicates, we compute the mean as the zero-
# adjusted estimate
tmb2zt.adj.fit <- apply(res_cor2, 2, mean)
# Compare estimates, note the similarity between the 2nd and 4th estimates
m2.fit$par # non-truncated 'poilog'
m2zt.fit$par # zero-truncated 'poilog'
c(tmb2zt.fit$fit$par[c(1, 1)],
  rep(sqrt(sum(exp(tmb2zt.fit$fit$par[c(2:3)])^2)), 2),
  exp(tmb2zt.fit$fit$par[3]) ^ 2 /
    sum(exp(tmb2zt.fit$fit$par[c(2:3)])^2)) # zero-truncated glmm
c(tmb2zt.adj.fit[c(1, 1)],
  rep(sqrt(sum(exp(tmb2zt.adj.fit[c(2:3)])^2)), 2),
  exp(tmb2zt.adj.fit[3]) ^ 2 /
    sum(exp(tmb2zt.adj.fit[c(2:3)])^2)) # zero-adjusted glmm

# To plot the contours, see:
# https://stackoverflow.com/questions/36221596/plot-multivariate-gaussian-contours-with-ggplot2
library(plyr) # only for this example!

# Confidence level
alpha_levels <- 0.95
names(alpha_levels) <- alpha_levels

# True parameter values
contour_data <- ldply(alpha_levels,
                      ellipse,
                      x = matrix(c(1, 1.5/2,
                                   1.5/2, 1), nrow=2),
                      scale = rep(sqrt(2), 2),
                      centre = rep(1, 2))

# non-truncated Poisson lognormal
contour_data_m2 <- ldply(alpha_levels,
                         ellipse,
                         x = matrix(c(1, m2.fit$par[5],
                                      m2.fit$par[5], 1), nrow=2),
                         scale = m2.fit$par[3:4],
                         centre = m2.fit$par[1:2])

# zero-truncated Poisson lognormal
contour_data_m2zt <- ldply(alpha_levels,
                           ellipse,
                           x = matrix(c(1, m2zt.fit$par[5],
                                        m2zt.fit$par[5], 1), nrow=2),
                           scale = m2zt.fit$par[3:4],
                           centre = m2zt.fit$par[1:2])

# zero-truncated GLMM
contour_data_tmb2zt <- ldply(alpha_levels,
                             ellipse,
                             x = matrix(data = exp(tmb2zt.fit$fit$par[3])^2 +
                                          c(exp(tmb2zt.fit$fit$par[2])^2, 0,
                                            0, exp(tmb2zt.fit$fit$par[2])^2),
                                        nrow = 2) /
                               (exp(tmb2zt.fit$fit$par[3])^2 +
                                  exp(tmb2zt.fit$fit$par[2])^2),
                             scale = rep(
                               sqrt(sum(exp(tmb2zt.fit$fit$par[2:3])^2)),
                               2),
                             centre = rep(tmb2zt.fit$fit$par[1], 2))

# zero-adjusted GLMM
contour_data_tmb2zt_adj <- ldply(alpha_levels,
                                 ellipse,
                                 x = matrix(data = exp(tmb2zt.adj.fit[3])^2 +
                                              c(exp(tmb2zt.adj.fit[2])^2, 0,
                                                0, exp(tmb2zt.adj.fit[2])^2),
                                            nrow = 2) /
                                   (exp(tmb2zt.adj.fit[3])^2 +
                                      exp(tmb2zt.adj.fit[2])^2),
                                 scale = rep(
                                   sqrt(sum(exp(tmb2zt.adj.fit[2:3])^2)),
                                   2),
                                 centre = rep(tmb2zt.adj.fit[1], 2))


# Make a figure of the data, marking observed (n > 0) abundances in the first
# population as grey and observations in the second populatoin as triangles.
# Add all four estimated ellipses. In the background there is a shade of
# the theoretical (true) distribution
fig1b <- data2_tbl %>%
  pivot_wider(names_from = set, values_from = x:z) %>%
  ggplot(aes(x = x_1, y = x_2, fill = as.factor(z_1))) +
  # True distribution
  geom_polygon(data = contour_data, aes(x, y, group=.id, fill = NULL), alpha = 0.15) +
  # non-truncated Poisson lognormal
  geom_path(data = contour_data_m2, aes(x, y, group=.id, fill = NULL), linetype = 3, size = 1) +
  # zero-truncated Poisson lognormal
  geom_path(data = contour_data_m2zt, aes(x, y, group=.id, fill = NULL), linetype = 2, size = 1) +
  # zero-truncated GLMM
  geom_path(data = contour_data_tmb2zt, aes(x, y, group=.id, fill = NULL), size = 1) +
  # zero-adjusted GLMM
  geom_path(data = contour_data_tmb2zt_adj, aes(x, y, group=.id, fill = NULL), size = 1) +
  # Observations
  geom_point(aes(shape = as.factor(z_2)), size = 5, alpha = 0.8) +
  # aesthetics
  scale_shape_manual(values = c(21, 24)) +
  scale_fill_manual(values = c("white", "grey")) +
  theme_bw() +
  labs(x = "Log abundance in population 1",
       y = "Log abundance in population 2") +
  guides(shape = "none",
         fill = "none")

# Unload package
detach("package:plyr", unload = TRUE)

# Plot both figures in one go!
plot_grid(fig1a, fig1b, labels = c('A', 'B'))

# ggsave("zero_truncation_illustration.pdf", width = 8, height = 5)
erikbsolbu/dynamicSAD documentation built on Jan. 25, 2021, 5 a.m.