dynamicSAD:

# This function simulates dynamic species abundance distributions for a given
# list() input which needs the following information:

# example_input <- list(
#   # (n_sp) number of species
#   n_sp = 20,
#   # (n_year) number of time points: can either be
#   # - a scalar, giving the number of time points, or
#   # - a vector of specific time points to be included in the simulation, where
#   #   time = "fixed" needs to be specified, see below
#   n_year = 10,
#   # (n_loc) number of locations: scalar. If locations have specific coordinates
#   # location = "fixed", see below and pos_x and pos_y needs to be specified
#   n_loc = 5,
#   # (n_obs) number of "observers": scalar. the number of repeated measures for
#   # each unique combination of species-location-time point.
#   n_obs = 2,
#   # (mu) mean log abundance of species in the community, in practice this
#   # parameter is confounded with the sampling intensity
#   mu = 0,
#   # (mu_sigma) variance in mean log abundance between each location-time
#   # point. Need to set "var_mu = TRUE" to be used
#   mu_sigma = 0,
#   # (sigma_r_sq) species heterogeneity: this is the variance in species
#   # abundance distribution due to variation in log carrying capacity, in the
#   # text this is referred to as $\sigma_r^2/\delta^2$
#   sigma_r_sq = 2,
#   # (sigma_s_sq) species specific environmental variance: this is the variance
#   # in species abundance distribution due to variation between years, in the
#   # text this is referred to as $\sigma_s^2/2\delta$
#   sigma_s_sq = 1,
#   # (sigma_d_sq) over-dispersion: notation is terrible due to the fact that
#   # over-dispersion is also confounded with demographic variance, in the text
#   # this is referred to as $\theta^2$, or $\sigma_d^2/2\bar{K}\delta+\theta^2$
#   sigma_d_sq = 0.5,
#   # (delta) strength of density regulation: the inverse mean return time to
#   # equilibrium and the exponential rate of decay in the temporal dimension
#   delta = 0.2,
#   # (eta) inverse spatial scaling: here we assume spatial scaling is
#   # exponentially decreasing
#   eta = 0.3,
#   # (pos_x) position of location on the x-axis: if specific locations are used
#   # they need to be specified with a vector of x-coordinates, if NULL, it is
#   # assumed locations are located along a diagonal line, i.e. (1,1), (2,2) etc
#   pos_x = NULL,
#   # (pos_x) position of location on the x-axis: if specific locations are used
#   # they need to be specified with a vector of x-coordinates, if NULL, it is
#   # assumed locations are located along a diagonal line, i.e. (1,1), (2,2) etc
#   pos_y = NULL)

# Un-comment the list above to test the simulation function

# required packages

# library(Matrix)
# library(MASS)
# library(tidyverse)

# just load "required_packages.R"

# f_sim_sad_fast: function simulate species abundance distributions fast (the
# old version was slow:))

# input: see above
# time: "seq" = time points are a sequence, going from 1 to (n_year),
#       "fixed" = if (n_year) is a vector of time points
# space: "seq" = locations are a sequence, going from (1, 1) to (n_loc, n_loc)
#        "fixed" = if (pos_x) and (pos_y) are specified
# dependency: "none" = regular simulation
#             "time" = used when simulation data when estimating temporal
#                      correlation and each location is assumed independent
#                      of each other
#             "space" = used when simulation data when estimating spatial
#                       correlation and each time point is assumed independent
#                       of each other
# var_mu: FALSE = mean log abundance (mu) is assumed to be the same for all
#                 location-time points
#         TRUE = mean log abundance (mu) is assumed to vary between location-
#                time points with variance (mu_sigma)

f_sim_sad_fast <- function(input,
                           time = "seq",
                           space = "seq",
                           dependency = "none",
                           var_mu = FALSE){

  # Checking if locations or time points are fixed or sequential ("seq")

  if (time == "seq"){
    s_year <- 1:input$n_year
    l_year <- input$n_year
  }
  else{ # i.e. time = "fixed"
    s_year <- input$n_year
    l_year <- length(input$n_year)
  }

  if (space == "seq"){
    pos_x <- 1:input$n_loc
    pos_y <- 1:input$n_loc
    s_loc <- 1:input$n_loc
    l_loc <- input$n_loc
  }
  else{
    pos_x <- input$pos_x
    pos_y <- input$pos_y
    s_loc <- 1:length(input$pos_x)
    l_loc <- length(s_loc)
  }

  # spatio-temporal correlation

  # The first is regular spatio-temporal correlation, the other two are used
  # when simulation distributions conditioning on either space or time
  if (dependency == "none"){

    sigma_sim <- kronecker(
      input$sigma_s_sq * exp(- input$delta * as.matrix(dist(s_year))),
      exp(- input$eta * as.matrix(dist(cbind(pos_x, pos_y)))),
      FUN = "*") +
      input$sigma_r_sq

    mu_sim <- rep(input$mu, l_year * l_loc)

  }
  else if (dependency == "time"){

    sigma_sim <- kronecker(
      input$sigma_s_sq * exp(- input$delta * as.matrix(dist(s_year))) +
        input$sigma_r_sq,
      diag(nrow = l_loc),
      FUN = "*")

    mu_sim <- rep(input$mu, l_year * l_loc)

  }
  else if (dependency == "space"){

    sigma_sim <- kronecker(
      diag(nrow = l_year),
      input$sigma_s_sq * exp(- input$eta *
                               as.matrix(dist(cbind(pos_x, pos_y)))) +
        input$sigma_r_sq,
      FUN = "*")

    mu_sim <- rep(input$mu, l_year * l_loc)

  }

  # Checking and simulating additional variation between space-time-points

  if (!var_mu){
    x_sim <- mvrnorm(n = input$n_sp,
                     mu = mu_sim,
                     Sigma = sigma_sim)
  }
  else{
    x_sim <- mvrnorm(n = input$n_sp,
                     mu = rnorm(l_year * l_loc, mu_sim, sqrt(input$mu_sigma)),
                     Sigma = sigma_sim)
  }

  # naming columns as combinations of location and time
  colnames(x_sim) <- paste(rep(s_loc, l_year),
                           rep(s_year, each = l_loc), sep = ".")

  # naming rows as species
  rownames(x_sim) <- 1:input$n_sp

  # make table
  x_sim_tbl <- as_tibble(x_sim)

  # make species column
  x_sim_tbl <- x_sim_tbl %>%
    rownames_to_column(var = "species")

  # organise table
  x_sim_tbl <- x_sim_tbl %>%
    pivot_longer(cols = -species,
                 names_to = "location.year",
                 values_to = "log_abundance")

  # add over-dispersion
  error <- rnorm(input$n_obs*nrow(x_sim_tbl),
                 mean = x_sim_tbl$log_abundance,
                 sd = sqrt(input$sigma_d_sq))

  # re-organise table again!
  x_sim_tbl <- tibble(
    species = rep(x_sim_tbl$species, input$n_obs),
    location.year = rep(x_sim_tbl$location.year, input$n_obs),
    log_abundance = rep(x_sim_tbl$log_abundance, input$n_obs),
    observer = as.character(rep(1:input$n_obs, each = nrow(x_sim_tbl))),
    log_abundance_error = error)

  # Generate abundances
  y_sim_tbl <- x_sim_tbl %>%
    mutate(abundance = rpois(nrow(x_sim_tbl),
                             lambda = exp(log_abundance_error)))

  # associate each year and location to each other
  year_loc_tbl <- tibble(year = rep(s_year, each = l_loc),
                         location = rep(s_loc, l_year),
                         location.year = paste(location, year, sep = "."))

  # add year and location as variable
  y_sim_tbl <- left_join(y_sim_tbl, year_loc_tbl, by = "location.year")

  # create temporal variable in the format of glmmTMB
  y_sim_tbl <- y_sim_tbl %>%
    arrange(year) %>%
    mutate(year_m = numFactor(as.numeric(year)))

  # associate each location to a position
  loc_tbl <- tibble(location = s_loc,
                    pos_x = pos_x,
                    pos_y = pos_y)

  # add position to data table
  y_sim_tbl <- left_join(y_sim_tbl, loc_tbl, by = "location")

  # create spatial variable in the format of glmmTMB
  y_sim_tbl <- y_sim_tbl %>%
    mutate(pos_m = numFactor(pos_x, pos_y))

  return(y_sim_tbl)
}