dynamicSAD:

# Estimating dynamic species abundance model using the 'poilog' package

# Estimating univariate Poisson lognormal distributions ####

## f_poilog_yl: function poilog year-location ####
# for each observer/replicate at each unique year-location combination, we fit
# a univariate poisson lognormal distribtion. The mean value of the estimated
# variances are used as estimate of the total variance which is partitioned
# based on correlations that are estimated below.

f_poilog_yl <- function(data){

  result <- c()

  # obs: repeated measurements
  for (obs in unique(data$observer)){
    # y_i: unique time points
    for (y_i in unique(data$year_m)){
      # l_i: unique locations
      for (l_i in unique(data$pos_m)){
        # find the corresponding subset of the data
        n_i <- data %>%
          filter(year_m == y_i,
                 pos_m == l_i,
                 observer == obs) %>%
          dplyr::select(species, abundance)

        # fit the univariate poisson lognormal distribution
        est_i <- try(poilogMLE(n = n_i$abundance))

        # if the estimation worked, store the result
        if (class(est_i) != "try-error"){
          est_i_tbl <- tibble(mu = est_i$par[1],
                              lsigma_e = log(est_i$par[2]),
                              p0_hat = 1 - est_i$p,
                              e_0 = NA)

          result <- bind_rows(result,
                              add_column(est_i_tbl,
                                         y_i = y_i,
                                         l_i = l_i,
                                         obs = obs),
          )
        }
        cat(paste(y_i, l_i, obs, sep = " / "), " \r")
      }
    }
  }

  return(result)

}


# Estimating bivariate Poisson lognormal distributions ####

## f_poilog_yll: function poilog year-location1-location2 ####
# for each observer/replicate at each unique year-location1-location2
# combination, we fit the bivariate poisson lognormal distribtion.
# The correlations from these estimates are used to determine the different
# proportions of variance, strength of density regulation and inverse spatial
# scaling

f_poilog_yll <- function(data){

  result_yll <- c()

  # obs: for each observer/replicate
  for (obs in unique(data$observer)){

    # y_i: for each unique time point
    for (y_i in unique(data$year_m)){

      # l_selected: all possible locations a given time point
      l_selected <- unique(data$pos_m)

      # l_i: for each location
      for (l_i in unique(data$pos_m)){

        # all remaining locations
        l_selected <- l_selected[l_selected != l_i]

        # data for the first community
        n_i <- data %>%
          filter(year_m == y_i,
                 pos_m == l_i,
                 observer == obs) %>%
          dplyr::select(species, abundance)

        # l_j: for each remaining location
        for (l_j in l_selected){

          # data for the second community
          n_j <- data %>%
            filter(year_m == y_i,
                   pos_m == l_j,
                   observer == obs) %>%
            dplyr::select(species, abundance)

          # join the two data sets together
          n_ij <- full_join(n_i, n_j, by = "species",
                            suffix = c(".i", ".j")) %>%
            mutate(abundance.i = ifelse(is.na(abundance.i), 0, abundance.i),
                   abundance.j = ifelse(is.na(abundance.j), 0, abundance.j))

          # fit the bivariate Poisson lognormal distribution
          est_ij <- try(bipoilogMLE(n1 = cbind(n_ij[, 2], n_ij[, 3])))

          # if the estimation worked, store the result
          if (class(est_ij) != "try-error"){
            est_ij_tbl <- tibble(mu_1 = est_ij$par[1],
                                 mu_2 = est_ij$par[2],
                                 lsigma_e_1 = log(est_ij$par[3]),
                                 lsigma_e_2 = log(est_ij$par[4]),
                                 us = NA,
                                 p0_hat = 1 - mean(est_ij$p),
                                 e_0 = NA,
                                 rho = est_ij$par[5])

            result_yll <- bind_rows(result_yll,
                                    add_column(est_ij_tbl,
                                               y_i = y_i,
                                               l_i = l_i,
                                               l_j = l_j,
                                               obs = obs),
            )
          }

          cat(paste(y_i, l_i, l_j, obs, sep = " / "), " \r")

        }

      }

    }
  }

  return(result_yll)

}

## f_poilog_yyll: function poilog year1-year2-location1-location2 ####
# for each observer/replicate at each unique year1-year2-location1-location2
# combination, we fit the bivariate poisson lognormal distribtion.
# The correlations from these estimates are used to determine the different
# proportions of variance, strength of density regulation and inverse spatial
# scaling

f_poilog_yyll <- function(data){

  result_yyll <- c()

  # obs: for each observer/replicate
  for (obs in unique(data$observer)){

    # y_selected: all possible time points
    y_selected <- unique(data$year_m)

    # y_i: for all years, except the last one, which will be included in all
    # the other combinations
    for (y_i in unique(data$year_m)[-n_distinct(data$year_m)]){

      # all remaining years
      y_selected <- y_selected[y_selected != y_i]

      # l_i: for all locations
      for (l_i in unique(data$pos_m)){

        # first data set
        n_i <- data %>%
          filter(year_m == y_i,
                 pos_m == l_i,
                 observer == obs) %>%
          dplyr::select(species, abundance)

        # y_j: for all remaining time points
        for (y_j in y_selected){

          # l_j: for all remaining locations
          for (l_j in unique(data$pos_m)){

            # second data set
            n_j <- data %>%
              filter(year_m == y_j,
                     pos_m == l_j,
                     observer == obs) %>%
              dplyr::select(species, abundance)

            # join data
            n_ij <- full_join(n_i, n_j, by = "species",
                              suffix = c(".i", ".j")) %>%
              mutate(abundance.i = ifelse(is.na(abundance.i), 0, abundance.i),
                     abundance.j = ifelse(is.na(abundance.j), 0, abundance.j))

            # fit the bivariate poisson lognormal distribution
            est_ij <- try(bipoilogMLE(n1 = cbind(n_ij[, 2], n_ij[, 3])))

            # if it worked, store the result
            if (class(est_ij) != "try-error"){
              est_ij_tbl <- tibble(mu_1 = est_ij$par[1],
                                   mu_2 = est_ij$par[2],
                                   lsigma_e_1 = log(est_ij$par[3]),
                                   lsigma_e_2 = log(est_ij$par[4]),
                                   us = NA,
                                   p0_hat = 1 - mean(est_ij$p),
                                   e_0 = NA,
                                   rho = est_ij$par[5])

              result_yyll <- bind_rows(result_yyll,
                                       add_column(est_ij_tbl,
                                                  y_i = y_i,
                                                  y_j = y_j,
                                                  l_i = l_i,
                                                  l_j = l_j,
                                                  obs = obs),
              )
            }

            cat(paste(y_i, y_j, l_i, l_j, obs, sep = " / "), " \r")

          }

        }

      }

    }
  }

  return(result_yyll)

}

## f_poilog_ylo: function poilog year-location-observer ####
# for each pair of observer/replicate at each unique year-location
# combination, we fit the bivariate poisson lognormal distribtion.
# The correlations from these estimates are used to determine the different
# proportions of variance, strength of density regulation and inverse spatial
# scaling

f_poilog_ylo <- function(data){

  result_ylo <- c()

  # y_i: for each time point
  for (y_i in unique(data$year_m)){

    # l_i: for each location
    for (l_i in unique(data$pos_m)){

      # o_selected: possible observers/replicates
      o_selected <- unique(filter(data,
                                  year_m == y_i,
                                  pos_m == l_i)$observer)

      # n_o: number of observers/replicates
      n_o <- n_distinct(filter(data,
                               year_m == y_i,
                               pos_m == l_i)$observer)

      # obs_i: for all observers, except the last one
      for (obs_i in unique(filter(data,
                                  year_m == y_i,
                                  pos_m == l_i)$observer)[-n_o]){

        # first data set
        n_i <- data %>%
          filter(year_m == y_i,
                 pos_m == l_i,
                 observer == obs_i) %>%
          dplyr::select(species, abundance)

        # remaining observers
        o_selected <- o_selected[o_selected != obs_i]

        # obs_j: for all remaining observers
        for (obs_j in o_selected){

          # second data set
          n_j <- data %>%
            filter(year_m == y_i,
                   pos_m == l_i,
                   observer == obs_j) %>%
            dplyr::select(species, abundance)

          # join data sets together
          n_ij <- full_join(n_i, n_j, by = "species", suffix = c(".i", ".j")) %>%
            mutate(abundance.i = ifelse(is.na(abundance.i), 0, abundance.i),
                   abundance.j = ifelse(is.na(abundance.j), 0, abundance.j))

          # fit bivariate Poisson lognormal distribution
          est_ij <- try(bipoilogMLE(n1 = cbind(n_ij[, 2], n_ij[, 3])))

          # if estimation works, store the result
          if (class(est_ij) != "try-error"){
            est_ij_tbl <- tibble(mu_1 = est_ij$par[1],
                                 mu_2 = est_ij$par[2],
                                 lsigma_e_1 = log(est_ij$par[3]),
                                 lsigma_e_2 = log(est_ij$par[4]),
                                 us = NA,
                                 p0_hat = 1 - mean(est_ij$p),
                                 e_0 = NA,
                                 rho = est_ij$par[5])

            result_ylo <- bind_rows(result_ylo,
                                     add_column(est_ij_tbl,
                                                y_i = y_i,
                                                l_i = l_i,
                                                obs_i = obs_i,
                                                obs_j = obs_j),
            )
          }

          cat(paste(y_i, l_i, obs_i, obs_j, sep = " / "), " \r")

        }

      }

    }
  }

  return(result_ylo)

}

# auxiliary function: f_distance ####
# takes all bivariate estimates (result_yll and result_yyll) and joins them
# together with their corresponding distance in space or time

f_distance <- function(data, result_yll, result_yyll){

  # matrix of all time distances
  time_mat <- as.matrix(dist(parseNumLevels(unique(data$year_m))))
  colnames(time_mat) <- as.character(unique(data$year_m))
  rownames(time_mat) <- as.character(unique(data$year_m))

  # matrix of all spatial distances
  dist_mat <- as.matrix(dist(parseNumLevels(unique(data$pos_m))))
  colnames(dist_mat) <- as.character(unique(data$pos_m))
  rownames(dist_mat) <- as.character(unique(data$pos_m))

  # all unique time points
  time_yiyj <- result_yyll %>%
    dplyr::select(y_i, y_j) %>%
    distinct()

  # add time distance
  time_yiyj <- time_yiyj %>%
    group_by(y_i, y_j) %>%
    mutate(time = time_mat[which(colnames(time_mat) == y_i),
                           which(rownames(time_mat) == y_j)]) %>%
    ungroup()

  # all unique locations
  dist_lilj <- result_yll %>%
    dplyr::select(l_i, l_j) %>%
    distinct()

  # add spatial distance
  dist_lilj <- dist_lilj %>%
    group_by(l_i, l_j) %>%
    mutate(distance = dist_mat[which(colnames(dist_mat) == l_i),
                               which(rownames(dist_mat) == l_j)]) %>%
    ungroup()

  # add temporal and spatial distance to results
  result_yll <- left_join(result_yll, dist_lilj)
  result_yyll <- left_join(result_yyll, time_yiyj)
  result_yyll <- left_join(result_yyll, dist_lilj)

  # making sure NA distances are zero
  result_yyll <- result_yyll %>%
    group_by(y_i, y_j, l_i, l_j, obs) %>%
    mutate(distance = ifelse(is.na(distance),
                             ifelse(l_i != l_j,
                                    dist_mat[which(colnames(dist_mat) == l_j),
                                             which(rownames(dist_mat) == l_i)],
                                    0),
                             distance)) %>%
    ungroup()

  # making sure NA times are zero and re-organise to match other results
  result_yll <- result_yll %>%
    add_column(y_j = NA,
               time = 0) %>%
    dplyr::select(mu_1, mu_2, lsigma_e_1, lsigma_e_2, us, p0_hat, e_0, rho,
                  y_i, y_j, l_i, l_j, obs, time, distance)

  # join results
  result_all_yyll <- bind_rows(result_yll,
                               result_yyll)

  return(result_all_yyll)
}

# total variance ####
# based on the results from the univariate Poisson lognormal distributions, we
# compute the mean log abundance and mean total variance (and standard errors)

f_tot_var <- function(result_yl){
  result <- tibble(avg_mu = mean(result_yl$mu),
                   sd_mu = sd(result_yl$mu),
                   avg_sigma_sq = mean(exp(result_yl$lsigma_e)^2),
                   sd_sigma_sq = sd(exp(result_yl$lsigma_e)^2))
  result
}

# proportion of variance due to over-dispersion ####
# if available, this function estimates 1 - the correlation at time/distance
# equal to zero, which is the proportion of the total variance attributable
# to over-dispersion
f_prop_error <- function(result_ylo){
  result <- tibble(rho_0 = mean(1 - result_ylo$rho))
  result
}

# correlation function ####
# with all the different correlations estimated at different spatial and
# temporal distances, we fit a exponential curve using least squares
f_rho_fun <- function(result_yyll, prop_error, given = "none"){
  # all correlations and their distances
  rho_data <- result_yyll %>%
    dplyr::select(time, distance, rho)

  # correlation function, to be used in the sum of squares
  cor_f <- function(t, z, par, data, prop_error, given){
    # if there is a proportion of variance due to over-dispersion, reduce the
    # remaining proportion accordingly
    # A is the proportion due to environmental variance
    A = (1 - prop_error) * exp(par[1])/(1 + exp(par[1]))
    # B is the proportion due to species heterogeneity
    B = (1 - prop_error) * exp(par[2])/(1 + exp(par[2]))
    # if temporal correlation is not estimated
    if(given == "time"){
      ct = 0
    }
    else{
      ct = exp(par[3])
    }
    # if spatial correlation is not estimated
    if(given == "space"){
      cz = 0
    }
    else{
      cz = exp(par[4])
    }
    A * exp(-ct * t -cz * z) + B
  }

  # sum of squares function
  ss <- function(par, data, prop_error, given){
    sum((data$rho - cor_f(t = data$time,
                          z = data$distance,
                          par = par,
                          prop_error = prop_error,
                          given = given))^2)
  }

  # numerical optimiser
  est <- nlminb(start = c(1, 0, log(0.001), log(0.2)),
                objective = ss,
                data = rho_data,
                prop_error = prop_error, given = given)

  # calculate parameters
  mle_A <- (1 - prop_error) * exp(est$par[1])/(1 + exp(est$par[1]))
  mle_B <- (1 - prop_error) * exp(est$par[2])/(1 + exp(est$par[2]))
  if(given == "time"){
    mle_ct = 0
  }
  else{
    mle_ct = exp(est$par[3])
  }
  if(given == "space"){
    mle_cz = 0
  }
  else{
    mle_cz = exp(est$par[4])
  }

  return(tibble(prop_env = mle_A,
                prop_het = mle_B,
                delta = mle_ct,
                eta = mle_cz))
}